{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2024-06-16T15:08:46.778915Z",
     "start_time": "2024-06-16T15:08:12.270103Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking in indexes: https://mirrors.aliyun.com/pypi/simple/\r\n",
      "Requirement already satisfied: openai in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (0.28.1)\r\n",
      "Requirement already satisfied: requests>=2.20 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from openai) (2.31.0)\r\n",
      "Requirement already satisfied: tqdm in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from openai) (4.66.2)\r\n",
      "Requirement already satisfied: aiohttp in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from openai) (3.9.5)\r\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests>=2.20->openai) (3.3.2)\r\n",
      "Requirement already satisfied: idna<4,>=2.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests>=2.20->openai) (3.7)\r\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests>=2.20->openai) (2.2.1)\r\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests>=2.20->openai) (2024.2.2)\r\n",
      "Requirement already satisfied: aiosignal>=1.1.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from aiohttp->openai) (1.3.1)\r\n",
      "Requirement already satisfied: attrs>=17.3.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from aiohttp->openai) (23.2.0)\r\n",
      "Requirement already satisfied: frozenlist>=1.1.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from aiohttp->openai) (1.4.1)\r\n",
      "Requirement already satisfied: multidict<7.0,>=4.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from aiohttp->openai) (6.0.5)\r\n",
      "Requirement already satisfied: yarl<2.0,>=1.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from aiohttp->openai) (1.9.4)\r\n",
      "Note: you may need to restart the kernel to use updated packages.\n",
      "Looking in indexes: https://mirrors.aliyun.com/pypi/simple/\r\n",
      "Requirement already satisfied: langchain in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (0.2.5)\r\n",
      "Requirement already satisfied: PyYAML>=5.3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from langchain) (6.0.1)\r\n",
      "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from langchain) (2.0.30)\r\n",
      "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from langchain) (3.9.5)\r\n",
      "Requirement already satisfied: langchain-core<0.3.0,>=0.2.7 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from langchain) (0.2.7)\r\n",
      "Requirement already satisfied: langchain-text-splitters<0.3.0,>=0.2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from langchain) (0.2.1)\r\n",
      "Requirement already satisfied: langsmith<0.2.0,>=0.1.17 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from langchain) (0.1.77)\r\n",
      "Requirement already satisfied: numpy<2.0.0,>=1.26.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from langchain) (1.26.4)\r\n",
      "Requirement already satisfied: pydantic<3,>=1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from langchain) (2.7.1)\r\n",
      "Requirement already satisfied: requests<3,>=2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from langchain) (2.31.0)\r\n",
      "Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from langchain) (8.3.0)\r\n",
      "Requirement already satisfied: aiosignal>=1.1.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.1)\r\n",
      "Requirement already satisfied: attrs>=17.3.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (23.2.0)\r\n",
      "Requirement already satisfied: frozenlist>=1.1.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.4.1)\r\n",
      "Requirement already satisfied: multidict<7.0,>=4.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.0.5)\r\n",
      "Requirement already satisfied: yarl<2.0,>=1.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.9.4)\r\n",
      "Requirement already satisfied: jsonpatch<2.0,>=1.33 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from langchain-core<0.3.0,>=0.2.7->langchain) (1.33)\r\n",
      "Requirement already satisfied: packaging<25,>=23.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from langchain-core<0.3.0,>=0.2.7->langchain) (23.2)\r\n",
      "Requirement already satisfied: orjson<4.0.0,>=3.9.14 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from langsmith<0.2.0,>=0.1.17->langchain) (3.10.3)\r\n",
      "Requirement already satisfied: annotated-types>=0.4.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from pydantic<3,>=1->langchain) (0.6.0)\r\n",
      "Requirement already satisfied: pydantic-core==2.18.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from pydantic<3,>=1->langchain) (2.18.2)\r\n",
      "Requirement already satisfied: typing-extensions>=4.6.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from pydantic<3,>=1->langchain) (4.11.0)\r\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests<3,>=2->langchain) (3.3.2)\r\n",
      "Requirement already satisfied: idna<4,>=2.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests<3,>=2->langchain) (3.7)\r\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests<3,>=2->langchain) (2.2.1)\r\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests<3,>=2->langchain) (2024.2.2)\r\n",
      "Requirement already satisfied: jsonpointer>=1.9 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from jsonpatch<2.0,>=1.33->langchain-core<0.3.0,>=0.2.7->langchain) (2.4)\r\n",
      "Note: you may need to restart the kernel to use updated packages.\n",
      "Looking in indexes: https://mirrors.aliyun.com/pypi/simple/\r\n",
      "Requirement already satisfied: tiktoken in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (0.7.0)\r\n",
      "Requirement already satisfied: regex>=2022.1.18 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from tiktoken) (2024.4.16)\r\n",
      "Requirement already satisfied: requests>=2.26.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from tiktoken) (2.31.0)\r\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests>=2.26.0->tiktoken) (3.3.2)\r\n",
      "Requirement already satisfied: idna<4,>=2.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests>=2.26.0->tiktoken) (3.7)\r\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests>=2.26.0->tiktoken) (2.2.1)\r\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests>=2.26.0->tiktoken) (2024.2.2)\r\n",
      "Note: you may need to restart the kernel to use updated packages.\n",
      "Looking in indexes: https://mirrors.aliyun.com/pypi/simple/\r\n",
      "Requirement already satisfied: faiss-cpu in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (1.8.0)\r\n",
      "Requirement already satisfied: numpy in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from faiss-cpu) (1.26.4)\r\n",
      "Note: you may need to restart the kernel to use updated packages.\n",
      "Looking in indexes: https://mirrors.aliyun.com/pypi/simple/\r\n",
      "Collecting unstructured\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/57/a5/a961aad3ca3657ab61c6002c60f9a1b0357d52c268a43ad828f47f7d801a/unstructured-0.14.6-py3-none-any.whl (2.0 MB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m2.0/2.0 MB\u001B[0m \u001B[31m1.4 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m00:01\u001B[0m00:01\u001B[0m\r\n",
      "\u001B[?25hCollecting chardet (from unstructured)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/38/6f/f5fbc992a329ee4e0f288c1fe0e2ad9485ed064cac731ed2fe47dcc38cbf/chardet-5.2.0-py3-none-any.whl (199 kB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m199.4/199.4 kB\u001B[0m \u001B[31m1.4 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0ma \u001B[36m0:00:01\u001B[0m\r\n",
      "\u001B[?25hCollecting filetype (from unstructured)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl (19 kB)\r\n",
      "Collecting python-magic (from unstructured)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/6c/73/9f872cb81fc5c3bb48f7227872c28975f998f3e7c2b1c16e95e6432bbb90/python_magic-0.4.27-py2.py3-none-any.whl (13 kB)\r\n",
      "Collecting lxml (from unstructured)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/26/36/6e00905cb4de2d014f4a62df58f0e82d262b5461245d951a6e7442b0222a/lxml-5.2.2-cp312-cp312-macosx_10_9_universal2.whl (8.2 MB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m8.2/8.2 MB\u001B[0m \u001B[31m1.4 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m00:01\u001B[0m00:01\u001B[0m\r\n",
      "\u001B[?25hCollecting nltk (from unstructured)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/a6/0a/0d20d2c0f16be91b9fa32a77b76c60f9baf6eba419e5ef5deca17af9c582/nltk-3.8.1-py3-none-any.whl (1.5 MB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m1.5/1.5 MB\u001B[0m \u001B[31m1.4 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0ma \u001B[36m0:00:01\u001B[0m\r\n",
      "\u001B[?25hRequirement already satisfied: tabulate in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from unstructured) (0.9.0)\r\n",
      "Requirement already satisfied: requests in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from unstructured) (2.31.0)\r\n",
      "Requirement already satisfied: beautifulsoup4 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from unstructured) (4.12.3)\r\n",
      "Collecting emoji (from unstructured)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/e6/90/20ad30babfa8f2b5ab46281d8e17bdfdbb3ac294cda14d525b9c2d958846/emoji-2.12.1-py3-none-any.whl (431 kB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m431.4/431.4 kB\u001B[0m \u001B[31m1.4 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m00:01\u001B[0m00:01\u001B[0m\r\n",
      "\u001B[?25hRequirement already satisfied: dataclasses-json in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from unstructured) (0.6.6)\r\n",
      "Collecting python-iso639 (from unstructured)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/01/08/5e649cf18dec750d498c53c6c8eb1d9790752ebd50fa7f7e69cc0c277cfe/python_iso639-2024.4.27-py3-none-any.whl (274 kB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m274.7/274.7 kB\u001B[0m \u001B[31m1.4 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0ma \u001B[36m0:00:01\u001B[0m\r\n",
      "\u001B[?25hCollecting langdetect (from unstructured)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/0e/72/a3add0e4eec4eb9e2569554f7c70f4a3c27712f40e3284d483e88094cc0e/langdetect-1.0.9.tar.gz (981 kB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m981.5/981.5 kB\u001B[0m \u001B[31m1.4 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m00:01\u001B[0m00:01\u001B[0m\r\n",
      "\u001B[?25h  Preparing metadata (setup.py) ... \u001B[?25ldone\r\n",
      "\u001B[?25hRequirement already satisfied: numpy in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from unstructured) (1.26.4)\r\n",
      "Collecting rapidfuzz (from unstructured)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/11/8c/f83bb24d1961ef9124c5491dfcc678426d9f7e3025207de824cdac056642/rapidfuzz-3.9.3-cp312-cp312-macosx_11_0_arm64.whl (1.5 MB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m1.5/1.5 MB\u001B[0m \u001B[31m1.3 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m00:01\u001B[0m00:01\u001B[0m\r\n",
      "\u001B[?25hRequirement already satisfied: backoff in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from unstructured) (2.2.1)\r\n",
      "Requirement already satisfied: typing-extensions in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from unstructured) (4.11.0)\r\n",
      "Collecting unstructured-client (from unstructured)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/04/82/8e0efc977e6df124cd2b752bd4c5c52125f8760b0d6df384d76933738ef3/unstructured_client-0.23.5-py3-none-any.whl (40 kB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m41.0/41.0 kB\u001B[0m \u001B[31m1.3 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\r\n",
      "\u001B[?25hRequirement already satisfied: wrapt in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from unstructured) (1.16.0)\r\n",
      "Requirement already satisfied: tqdm in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from unstructured) (4.66.2)\r\n",
      "Requirement already satisfied: soupsieve>1.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from beautifulsoup4->unstructured) (2.5)\r\n",
      "Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from dataclasses-json->unstructured) (3.21.2)\r\n",
      "Requirement already satisfied: typing-inspect<1,>=0.4.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from dataclasses-json->unstructured) (0.9.0)\r\n",
      "Requirement already satisfied: six in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from langdetect->unstructured) (1.16.0)\r\n",
      "Requirement already satisfied: click in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from nltk->unstructured) (8.1.7)\r\n",
      "Collecting joblib (from nltk->unstructured)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/91/29/df4b9b42f2be0b623cbd5e2140cafcaa2bef0759a00b7b70104dcfe2fb51/joblib-1.4.2-py3-none-any.whl (301 kB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m301.8/301.8 kB\u001B[0m \u001B[31m1.4 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0ma \u001B[36m0:00:01\u001B[0m\r\n",
      "\u001B[?25hRequirement already satisfied: regex>=2021.8.3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from nltk->unstructured) (2024.4.16)\r\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests->unstructured) (3.3.2)\r\n",
      "Requirement already satisfied: idna<4,>=2.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests->unstructured) (3.7)\r\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests->unstructured) (2.2.1)\r\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests->unstructured) (2024.2.2)\r\n",
      "Collecting deepdiff>=6.0 (from unstructured-client->unstructured)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/18/e6/d27d37dc55dbf40cdbd665aa52844b065ac760c9a02a02265f97ea7a4256/deepdiff-7.0.1-py3-none-any.whl (80 kB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m80.8/80.8 kB\u001B[0m \u001B[31m1.5 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0ma \u001B[36m0:00:01\u001B[0m\r\n",
      "\u001B[?25hRequirement already satisfied: httpx>=0.27.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from unstructured-client->unstructured) (0.27.0)\r\n",
      "Collecting jsonpath-python>=1.0.6 (from unstructured-client->unstructured)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/16/8a/d63959f4eff03893a00e6e63592e3a9f15b9266ed8e0275ab77f8c7dbc94/jsonpath_python-1.0.6-py3-none-any.whl (7.6 kB)\r\n",
      "Requirement already satisfied: mypy-extensions>=1.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from unstructured-client->unstructured) (1.0.0)\r\n",
      "Requirement already satisfied: nest-asyncio>=1.6.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from unstructured-client->unstructured) (1.6.0)\r\n",
      "Requirement already satisfied: packaging>=23.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from unstructured-client->unstructured) (23.2)\r\n",
      "Collecting pypdf>=4.0 (from unstructured-client->unstructured)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/c9/d1/450b19bbdbb2c802f554312c62ce2a2c0d8744fe14735bc70ad2803578c7/pypdf-4.2.0-py3-none-any.whl (290 kB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m290.4/290.4 kB\u001B[0m \u001B[31m1.4 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0ma \u001B[36m0:00:01\u001B[0m\r\n",
      "\u001B[?25hRequirement already satisfied: python-dateutil>=2.8.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from unstructured-client->unstructured) (2.9.0.post0)\r\n",
      "Collecting requests-toolbelt>=1.0.0 (from unstructured-client->unstructured)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/3f/51/d4db610ef29373b879047326cbf6fa98b6c1969d6f6dc423279de2b1be2c/requests_toolbelt-1.0.0-py2.py3-none-any.whl (54 kB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m54.5/54.5 kB\u001B[0m \u001B[31m1.4 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\r\n",
      "\u001B[?25hCollecting ordered-set<4.2.0,>=4.1.0 (from deepdiff>=6.0->unstructured-client->unstructured)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/33/55/af02708f230eb77084a299d7b08175cff006dea4f2721074b92cdb0296c0/ordered_set-4.1.0-py3-none-any.whl (7.6 kB)\r\n",
      "Requirement already satisfied: anyio in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpx>=0.27.0->unstructured-client->unstructured) (3.7.1)\r\n",
      "Requirement already satisfied: httpcore==1.* in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpx>=0.27.0->unstructured-client->unstructured) (1.0.5)\r\n",
      "Requirement already satisfied: sniffio in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpx>=0.27.0->unstructured-client->unstructured) (1.3.1)\r\n",
      "Requirement already satisfied: h11<0.15,>=0.13 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpcore==1.*->httpx>=0.27.0->unstructured-client->unstructured) (0.14.0)\r\n",
      "Building wheels for collected packages: langdetect\r\n",
      "  Building wheel for langdetect (setup.py) ... \u001B[?25ldone\r\n",
      "\u001B[?25h  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993227 sha256=237647202462c873b14cfe56c3b15f7be93c045284638358d34c0371fc6860d7\r\n",
      "  Stored in directory: /Users/wenzhixin/Library/Caches/pip/wheels/c1/92/74/6e9618b1a98b373b2a8d8c8105265a6c974b1a0099d154ea95\r\n",
      "Successfully built langdetect\r\n",
      "Installing collected packages: filetype, rapidfuzz, python-magic, python-iso639, pypdf, ordered-set, lxml, langdetect, jsonpath-python, joblib, emoji, chardet, requests-toolbelt, nltk, deepdiff, unstructured-client, unstructured\r\n",
      "Successfully installed chardet-5.2.0 deepdiff-7.0.1 emoji-2.12.1 filetype-1.2.0 joblib-1.4.2 jsonpath-python-1.0.6 langdetect-1.0.9 lxml-5.2.2 nltk-3.8.1 ordered-set-4.1.0 pypdf-4.2.0 python-iso639-2024.4.27 python-magic-0.4.27 rapidfuzz-3.9.3 requests-toolbelt-1.0.0 unstructured-0.14.6 unstructured-client-0.23.5\r\n",
      "Note: you may need to restart the kernel to use updated packages.\n",
      "Looking in indexes: https://mirrors.aliyun.com/pypi/simple/\r\n",
      "Collecting jq\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/bf/bb/579fa205ce3c774b031d971f99c602372657a44600c9086d9f603e1367ac/jq-1.7.0-cp312-cp312-macosx_11_0_arm64.whl (411 kB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m411.4/411.4 kB\u001B[0m \u001B[31m1.5 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0ma \u001B[36m0:00:01\u001B[0m\r\n",
      "\u001B[?25hInstalling collected packages: jq\r\n",
      "Successfully installed jq-1.7.0\r\n",
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "# Dependencies are pinned to the versions this notebook was last run with, so a fresh\n",
    "# environment resolves to the same packages; -q keeps the pip log out of the notebook.\n",
    "%pip install -q openai==0.28.1 langchain==0.2.5 tiktoken==0.7.0 faiss-cpu==1.8.0 unstructured==0.14.6 jq==1.7.0"
   ]
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "import getpass\n",
    "import os\n",
    "\n",
    "# Ask for the key interactively instead of hardcoding it, and only when it is not\n",
    "# already set, so an unattended Restart & Run All does not block on the prompt.\n",
    "if not os.environ.get(\"OPENAI_API_KEY\"):\n",
    "    os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"输入openAi-key\")"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-15T15:18:02.644607Z",
     "start_time": "2024-07-15T15:17:43.410889Z"
    }
   },
   "id": "78207be1e8637c11",
   "execution_count": 1
  },
  {
   "cell_type": "markdown",
   "source": [
    "# 1. csv loader"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "42d3884b652f9ebe"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Document(page_content='球队: 国民队\\n工资(百万): 81.34\\n胜场数: 98', metadata={'source': './file/test.csv', 'row': 0}), Document(page_content='球队: 红人队\\n工资(百万): 82.20\\n胜场数: 97', metadata={'source': './file/test.csv', 'row': 1}), Document(page_content='球队: 洋基队\\n工资(百万): 197.96\\n胜场数: 95', metadata={'source': './file/test.csv', 'row': 2}), Document(page_content='球队: 巨人队\\n工资(百万): 117.62\\n胜场数: 94', metadata={'source': './file/test.csv', 'row': 3}), Document(page_content='球队: 勇士队\\n工资(百万): 83.31\\n胜场数: 94', metadata={'source': './file/test.csv', 'row': 4}), Document(page_content='球队: 运动家队\\n工资(百万): 55.37\\n胜场数: 94', metadata={'source': './file/test.csv', 'row': 5}), Document(page_content='球队: 游骑兵队\\n工资(百万): 120.51\\n胜场数: 93', metadata={'source': './file/test.csv', 'row': 6}), Document(page_content='球队: 金莺队\\n工资(百万): 81.43\\n胜场数: 93', metadata={'source': './file/test.csv', 'row': 7}), Document(page_content='球队: 光芒队\\n工资(百万): 64.17\\n胜场数: 90', metadata={'source': './file/test.csv', 'row': 8}), Document(page_content='球队: 天使队\\n工资(百万): 154.49\\n胜场数: 89', metadata={'source': './file/test.csv', 'row': 9}), Document(page_content='球队: 老虎队\\n工资(百万): 132.30\\n胜场数: 88', metadata={'source': './file/test.csv', 'row': 10}), Document(page_content='球队: 红雀队\\n工资(百万): 110.30\\n胜场数: 88', metadata={'source': './file/test.csv', 'row': 11}), Document(page_content='球队: 道奇队\\n工资(百万): 95.14\\n胜场数: 86', metadata={'source': './file/test.csv', 'row': 12}), Document(page_content='球队: 白袜队\\n工资(百万): 96.92\\n胜场数: 85', metadata={'source': './file/test.csv', 'row': 13}), Document(page_content='球队: 啤酒人队\\n工资(百万): 97.65\\n胜场数: 83', metadata={'source': './file/test.csv', 'row': 14}), Document(page_content='球队: 费城人队\\n工资(百万): 174.54\\n胜场数: 81', metadata={'source': './file/test.csv', 'row': 15}), Document(page_content='球队: 钻石backs队\\n工资(百万): 74.28\\n胜场数: 81', metadata={'source': './file/test.csv', 'row': 16}), Document(page_content='球队: 海盗队\\n工资(百万): 63.43\\n胜场数: 79', metadata={'source': './file/test.csv', 
'row': 17}), Document(page_content='球队: 神父队\\n工资(百万): 55.24\\n胜场数: 76', metadata={'source': './file/test.csv', 'row': 18}), Document(page_content='球队: 水手队\\n工资(百万): 81.97\\n胜场数: 75', metadata={'source': './file/test.csv', 'row': 19}), Document(page_content='球队: 大都会队\\n工资(百万): 93.35\\n胜场数: 74', metadata={'source': './file/test.csv', 'row': 20}), Document(page_content='球队: 蓝鸟队\\n工资(百万): 75.48\\n胜场数: 73', metadata={'source': './file/test.csv', 'row': 21}), Document(page_content='球队: 皇家队\\n工资(百万): 60.91\\n胜场数: 72', metadata={'source': './file/test.csv', 'row': 22}), Document(page_content='球队: 马林鱼队\\n工资(百万): 118.07\\n胜场数: 69', metadata={'source': './file/test.csv', 'row': 23}), Document(page_content='球队: 红袜队\\n工资(百万): 173.18\\n胜场数: 69', metadata={'source': './file/test.csv', 'row': 24}), Document(page_content='球队: 印第安人队\\n工资(百万): 78.43\\n胜场数: 68', metadata={'source': './file/test.csv', 'row': 25}), Document(page_content='球队: 双城队\\n工资(百万): 94.08\\n胜场数: 66', metadata={'source': './file/test.csv', 'row': 26}), Document(page_content='球队: 落矶队\\n工资(百万): 78.06\\n胜场数: 64', metadata={'source': './file/test.csv', 'row': 27}), Document(page_content='球队: 小熊队\\n工资(百万): 88.19\\n胜场数: 61', metadata={'source': './file/test.csv', 'row': 28}), Document(page_content='球队: 太空人队\\n工资(百万): 60.65\\n胜场数: 55', metadata={'source': './file/test.csv', 'row': 29})]\n"
     ]
    }
   ],
   "source": [
    "from langchain.document_loaders import CSVLoader\n",
    "\n",
    "# Create a CSVLoader for the sample file. The encoding is passed explicitly because the\n",
    "# file contains Chinese text and the platform default encoding is not always UTF-8.\n",
    "loader = CSVLoader(file_path=\"./file/test.csv\", encoding=\"utf-8\")\n",
    "# load() reads the file and returns one Document per CSV row.\n",
    "data = loader.load()\n",
    "# Print the loaded documents.\n",
    "print(data)\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-16T15:14:39.916372Z",
     "start_time": "2024-06-16T15:14:39.716654Z"
    }
   },
   "id": "2bbaecf670949957",
   "execution_count": 4
  },
  {
   "cell_type": "markdown",
   "source": [],
   "metadata": {
    "collapsed": false
   },
   "id": "f733e7f27d893e12"
  },
  {
   "cell_type": "markdown",
   "source": [
    "delimiter：这是一个字符，用于分隔CSV文件中的字段。通常，大多数CSV文件使用逗号作为分隔符，但也有可能使用其他字符，如制表符（'\\t'）或分号（';'）。\n",
    "\n",
    "quotechar：这是一个字符，用于封装那些包含特殊字符（如分隔符、换行符或其他引号字符）的字段。常见的引号字符是双引号（'\"'）。如果字段中包含特殊字符，那么这个字段会被quotechar定义的字符封装起来。例如，如果你的分隔符是逗号，而你有一个字段的值是 John, Smith，那么这个字段会被写成 \"John, Smith\"，以防止逗号被误解为字段的分隔符。"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "201300b4fff664af"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Document(page_content='球队: 球队\\n工资(百万): 工资(百万)\\n胜场数: 胜场数', metadata={'source': './file/test.csv', 'row': 0}), Document(page_content='球队: 国民队\\n工资(百万): 81.34\\n胜场数: 98', metadata={'source': './file/test.csv', 'row': 1}), Document(page_content='球队: 红人队\\n工资(百万): 82.20\\n胜场数: 97', metadata={'source': './file/test.csv', 'row': 2}), Document(page_content='球队: 洋基队\\n工资(百万): 197.96\\n胜场数: 95', metadata={'source': './file/test.csv', 'row': 3}), Document(page_content='球队: 巨人队\\n工资(百万): 117.62\\n胜场数: 94', metadata={'source': './file/test.csv', 'row': 4}), Document(page_content='球队: 勇士队\\n工资(百万): 83.31\\n胜场数: 94', metadata={'source': './file/test.csv', 'row': 5}), Document(page_content='球队: 运动家队\\n工资(百万): 55.37\\n胜场数: 94', metadata={'source': './file/test.csv', 'row': 6}), Document(page_content='球队: 游骑兵队\\n工资(百万): 120.51\\n胜场数: 93', metadata={'source': './file/test.csv', 'row': 7}), Document(page_content='球队: 金莺队\\n工资(百万): 81.43\\n胜场数: 93', metadata={'source': './file/test.csv', 'row': 8}), Document(page_content='球队: 光芒队\\n工资(百万): 64.17\\n胜场数: 90', metadata={'source': './file/test.csv', 'row': 9}), Document(page_content='球队: 天使队\\n工资(百万): 154.49\\n胜场数: 89', metadata={'source': './file/test.csv', 'row': 10}), Document(page_content='球队: 老虎队\\n工资(百万): 132.30\\n胜场数: 88', metadata={'source': './file/test.csv', 'row': 11}), Document(page_content='球队: 红雀队\\n工资(百万): 110.30\\n胜场数: 88', metadata={'source': './file/test.csv', 'row': 12}), Document(page_content='球队: 道奇队\\n工资(百万): 95.14\\n胜场数: 86', metadata={'source': './file/test.csv', 'row': 13}), Document(page_content='球队: 白袜队\\n工资(百万): 96.92\\n胜场数: 85', metadata={'source': './file/test.csv', 'row': 14}), Document(page_content='球队: 啤酒人队\\n工资(百万): 97.65\\n胜场数: 83', metadata={'source': './file/test.csv', 'row': 15}), Document(page_content='球队: 费城人队\\n工资(百万): 174.54\\n胜场数: 81', metadata={'source': './file/test.csv', 'row': 16}), Document(page_content='球队: 钻石backs队\\n工资(百万): 74.28\\n胜场数: 81', metadata={'source': './file/test.csv', 
'row': 17}), Document(page_content='球队: 海盗队\\n工资(百万): 63.43\\n胜场数: 79', metadata={'source': './file/test.csv', 'row': 18}), Document(page_content='球队: 神父队\\n工资(百万): 55.24\\n胜场数: 76', metadata={'source': './file/test.csv', 'row': 19}), Document(page_content='球队: 水手队\\n工资(百万): 81.97\\n胜场数: 75', metadata={'source': './file/test.csv', 'row': 20}), Document(page_content='球队: 大都会队\\n工资(百万): 93.35\\n胜场数: 74', metadata={'source': './file/test.csv', 'row': 21}), Document(page_content='球队: 蓝鸟队\\n工资(百万): 75.48\\n胜场数: 73', metadata={'source': './file/test.csv', 'row': 22}), Document(page_content='球队: 皇家队\\n工资(百万): 60.91\\n胜场数: 72', metadata={'source': './file/test.csv', 'row': 23}), Document(page_content='球队: 马林鱼队\\n工资(百万): 118.07\\n胜场数: 69', metadata={'source': './file/test.csv', 'row': 24}), Document(page_content='球队: 红袜队\\n工资(百万): 173.18\\n胜场数: 69', metadata={'source': './file/test.csv', 'row': 25}), Document(page_content='球队: 印第安人队\\n工资(百万): 78.43\\n胜场数: 68', metadata={'source': './file/test.csv', 'row': 26}), Document(page_content='球队: 双城队\\n工资(百万): 94.08\\n胜场数: 66', metadata={'source': './file/test.csv', 'row': 27}), Document(page_content='球队: 落矶队\\n工资(百万): 78.06\\n胜场数: 64', metadata={'source': './file/test.csv', 'row': 28}), Document(page_content='球队: 小熊队\\n工资(百万): 88.19\\n胜场数: 61', metadata={'source': './file/test.csv', 'row': 29}), Document(page_content='球队: 太空人队\\n工资(百万): 60.65\\n胜场数: 55', metadata={'source': './file/test.csv', 'row': 30})]\n"
     ]
    }
   ],
   "source": [
    "# csv_args is passed through to Python's csv reader; explicit fieldnames name the columns.\n",
    "loader = CSVLoader(file_path=\"./file/test.csv\", csv_args={\n",
    "    'delimiter': ',',\n",
    "    'quotechar': '\"',\n",
    "    'fieldnames': ['球队', '工资(百万)', '胜场数']\n",
    "})\n",
    "# Because fieldnames are supplied, the reader treats the file's own header line as a data row.\n",
    "# Drop that first document so no bogus '球队: 球队' record ends up in the result.\n",
    "data = loader.load()[1:]\n",
    "print(data)\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-16T15:24:03.041984Z",
     "start_time": "2024-06-16T15:24:03.028918Z"
    }
   },
   "id": "bc0303e73f62e86e",
   "execution_count": 5
  },
  {
   "cell_type": "markdown",
   "source": [
    "指定文档来源（`source`）对应的字段"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "23ea7e8cce7aa7da"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Document(page_content='球队: 国民队\\n工资(百万): 81.34\\n胜场数: 98', metadata={'source': '国民队', 'row': 0}), Document(page_content='球队: 红人队\\n工资(百万): 82.20\\n胜场数: 97', metadata={'source': '红人队', 'row': 1}), Document(page_content='球队: 洋基队\\n工资(百万): 197.96\\n胜场数: 95', metadata={'source': '洋基队', 'row': 2}), Document(page_content='球队: 巨人队\\n工资(百万): 117.62\\n胜场数: 94', metadata={'source': '巨人队', 'row': 3}), Document(page_content='球队: 勇士队\\n工资(百万): 83.31\\n胜场数: 94', metadata={'source': '勇士队', 'row': 4}), Document(page_content='球队: 运动家队\\n工资(百万): 55.37\\n胜场数: 94', metadata={'source': '运动家队', 'row': 5}), Document(page_content='球队: 游骑兵队\\n工资(百万): 120.51\\n胜场数: 93', metadata={'source': '游骑兵队', 'row': 6}), Document(page_content='球队: 金莺队\\n工资(百万): 81.43\\n胜场数: 93', metadata={'source': '金莺队', 'row': 7}), Document(page_content='球队: 光芒队\\n工资(百万): 64.17\\n胜场数: 90', metadata={'source': '光芒队', 'row': 8}), Document(page_content='球队: 天使队\\n工资(百万): 154.49\\n胜场数: 89', metadata={'source': '天使队', 'row': 9}), Document(page_content='球队: 老虎队\\n工资(百万): 132.30\\n胜场数: 88', metadata={'source': '老虎队', 'row': 10}), Document(page_content='球队: 红雀队\\n工资(百万): 110.30\\n胜场数: 88', metadata={'source': '红雀队', 'row': 11}), Document(page_content='球队: 道奇队\\n工资(百万): 95.14\\n胜场数: 86', metadata={'source': '道奇队', 'row': 12}), Document(page_content='球队: 白袜队\\n工资(百万): 96.92\\n胜场数: 85', metadata={'source': '白袜队', 'row': 13}), Document(page_content='球队: 啤酒人队\\n工资(百万): 97.65\\n胜场数: 83', metadata={'source': '啤酒人队', 'row': 14}), Document(page_content='球队: 费城人队\\n工资(百万): 174.54\\n胜场数: 81', metadata={'source': '费城人队', 'row': 15}), Document(page_content='球队: 钻石backs队\\n工资(百万): 74.28\\n胜场数: 81', metadata={'source': '钻石backs队', 'row': 16}), Document(page_content='球队: 海盗队\\n工资(百万): 63.43\\n胜场数: 79', metadata={'source': '海盗队', 'row': 17}), Document(page_content='球队: 神父队\\n工资(百万): 55.24\\n胜场数: 76', metadata={'source': '神父队', 'row': 18}), Document(page_content='球队: 水手队\\n工资(百万): 81.97\\n胜场数: 75', metadata={'source': '水手队', 'row': 
19}), Document(page_content='球队: 大都会队\\n工资(百万): 93.35\\n胜场数: 74', metadata={'source': '大都会队', 'row': 20}), Document(page_content='球队: 蓝鸟队\\n工资(百万): 75.48\\n胜场数: 73', metadata={'source': '蓝鸟队', 'row': 21}), Document(page_content='球队: 皇家队\\n工资(百万): 60.91\\n胜场数: 72', metadata={'source': '皇家队', 'row': 22}), Document(page_content='球队: 马林鱼队\\n工资(百万): 118.07\\n胜场数: 69', metadata={'source': '马林鱼队', 'row': 23}), Document(page_content='球队: 红袜队\\n工资(百万): 173.18\\n胜场数: 69', metadata={'source': '红袜队', 'row': 24}), Document(page_content='球队: 印第安人队\\n工资(百万): 78.43\\n胜场数: 68', metadata={'source': '印第安人队', 'row': 25}), Document(page_content='球队: 双城队\\n工资(百万): 94.08\\n胜场数: 66', metadata={'source': '双城队', 'row': 26}), Document(page_content='球队: 落矶队\\n工资(百万): 78.06\\n胜场数: 64', metadata={'source': '落矶队', 'row': 27}), Document(page_content='球队: 小熊队\\n工资(百万): 88.19\\n胜场数: 61', metadata={'source': '小熊队', 'row': 28}), Document(page_content='球队: 太空人队\\n工资(百万): 60.65\\n胜场数: 55', metadata={'source': '太空人队', 'row': 29})]\n"
     ]
    }
   ],
   "source": [
    "# source_column: each document's metadata['source'] is taken from the 球队 column\n",
    "# of its row instead of the CSV file path.\n",
    "loader = CSVLoader(file_path=\"./file/test.csv\",source_column = \"球队\")\n",
    "data = loader.load()\n",
    "print(data)\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-16T15:27:56.059550Z",
     "start_time": "2024-06-16T15:27:56.053606Z"
    }
   },
   "id": "e1fa59bf1d1e3c7c",
   "execution_count": 7
  },
  {
   "cell_type": "markdown",
   "source": [
    "# 2. 加载同目录下所有文档"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "da76115f216c88a3"
  },
  {
   "cell_type": "markdown",
   "source": [
    "在这段代码中：\n",
    "\n",
    "1. `.` 表示当前目录，`..` 表示父目录。因此，`'../'` 就表示当前目录的上一级目录。 \n",
    "\n",
    "2. `glob=\"**/*.md\"` 是一个glob模式，用于匹配文件路径。在这个模式中：\n",
    "\n",
    "   - `**` 表示任意数量的目录。它可以匹配零个、一个或多个子目录。例如，如果你有一个文件路径`/a/b/c.txt`，那么`**`可以匹配`/a/`、`/a/b/` 或者空。\n",
    "\n",
    "   - `*.md` 表示任意的 `.md` 文件。`*` 是一个通配符，表示任意数量的任意字符，`.md` 表示以 `.md` 结尾的文件。因此，`*.md` 可以匹配任何以 `.md` 结尾的文件。"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "3085af09bfcb412c"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "文档内容:\n",
      " 标题\n",
      "\n",
      "我是一个Markdown文件的示例。 \n",
      "\n",
      "元数据:\n",
      " {'source': 'file/markdown.md'} \n"
     ]
    }
   ],
   "source": [
    "from langchain.document_loaders import DirectoryLoader\n",
    "\n",
    "# Load every Markdown file under ./file; the \"**/*.md\" glob also descends into subdirectories.\n",
    "loader = DirectoryLoader('./file', glob=\"**/*.md\")\n",
    "document = loader.load()\n",
    "\n",
    "# Print the text and the metadata (source path) of each loaded document.\n",
    "for doc in document:\n",
    "    print(f\"文档内容:\\n {doc.page_content} \\n\")\n",
    "    print(f\"元数据:\\n {doc.metadata} \\n\")\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-17T16:09:19.907047Z",
     "start_time": "2024-06-17T16:09:19.894702Z"
    }
   },
   "id": "8cee447795a61dce",
   "execution_count": 30
  },
  {
   "cell_type": "markdown",
   "source": [
    "## 2.1 多线程加载文件"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "9a784ac8820562"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "文档内容:\n",
      " # learn-llm\n",
      "\n",
      "#### 介绍\n",
      "大模型学习\n",
      "\n",
      "#### 软件架构\n",
      "软件架构说明\n",
      "\n",
      "\n",
      "#### 安装教程\n",
      "\n",
      "1.  xxxx\n",
      "2.  xxxx\n",
      "3.  xxxx\n",
      "\n",
      "#### 使用说明\n",
      "\n",
      "1.  xxxx\n",
      "2.  xxxx\n",
      "3.  xxxx\n",
      "\n",
      "#### 参与贡献\n",
      "\n",
      "1.  Fork 本仓库\n",
      "2.  新建 Feat_xxx 分支\n",
      "3.  提交代码\n",
      "4.  新建 Pull Request\n",
      "\n",
      "\n",
      "#### 特技\n",
      "\n",
      "1.  使用 Readme\\_XXX.md 来支持不同的语言，例如 Readme\\_en.md, Readme\\_zh.md\n",
      "2.  Gitee 官方博客 [blog.gitee.com](https://blog.gitee.com)\n",
      "3.  你可以 [https://gitee.com/explore](https://gitee.com/explore) 这个地址来了解 Gitee 上的优秀开源项目\n",
      "4.  [GVP](https://gitee.com/gvp) 全称是 Gitee 最有价值开源项目，是综合评定出的优秀开源项目\n",
      "5.  Gitee 官方提供的使用手册 [https://gitee.com/help](https://gitee.com/help)\n",
      "6.  Gitee 封面人物是一档用来展示 Gitee 会员风采的栏目 [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/)\n",
      " \n",
      "\n",
      "元数据:\n",
      " {'source': '../README.md'} \n",
      "\n",
      "文档内容:\n",
      " # learn-llm\n",
      "\n",
      "#### Description\n",
      "大模型学习\n",
      "\n",
      "#### Software Architecture\n",
      "Software architecture description\n",
      "\n",
      "#### Installation\n",
      "\n",
      "1.  xxxx\n",
      "2.  xxxx\n",
      "3.  xxxx\n",
      "\n",
      "#### Instructions\n",
      "\n",
      "1.  xxxx\n",
      "2.  xxxx\n",
      "3.  xxxx\n",
      "\n",
      "#### Contribution\n",
      "\n",
      "1.  Fork the repository\n",
      "2.  Create Feat_xxx branch\n",
      "3.  Commit your code\n",
      "4.  Create Pull Request\n",
      "\n",
      "\n",
      "#### Gitee Feature\n",
      "\n",
      "1.  You can use Readme\\_XXX.md to support different languages, such as Readme\\_en.md, Readme\\_zh.md\n",
      "2.  Gitee blog [blog.gitee.com](https://blog.gitee.com)\n",
      "3.  Explore open source project [https://gitee.com/explore](https://gitee.com/explore)\n",
      "4.  The most valuable open source project [GVP](https://gitee.com/gvp)\n",
      "5.  The manual of Gitee [https://gitee.com/help](https://gitee.com/help)\n",
      "6.  The most popular members  [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/)\n",
      " \n",
      "\n",
      "元数据:\n",
      " {'source': '../README.en.md'} \n",
      "\n",
      "文档内容:\n",
      " # 标题\n",
      "我是一个Markdown文件的示例。\n",
      " \n",
      "\n",
      "元数据:\n",
      " {'source': '../base/file/markdown.md'} \n"
     ]
    }
   ],
   "source": [
    "from langchain.document_loaders import DirectoryLoader,TextLoader\n",
    "\n",
    "# Recursively collect Markdown files starting from the parent directory.\n",
    "# use_multithreading=True asks DirectoryLoader to load files concurrently;\n",
    "# loader_cls=TextLoader reads each file as plain text, so the Markdown markup is kept as-is.\n",
    "loader = DirectoryLoader(\"../\", glob=\"**/*.md\",use_multithreading=True,loader_cls=TextLoader)\n",
    "documents = loader.load()\n",
    "\n",
    "# Print the text and the metadata (source path) of each loaded document.\n",
    "for doc in documents:\n",
    "    print(f\"文档内容:\\n {doc.page_content} \\n\")\n",
    "    print(f\"元数据:\\n {doc.metadata} \\n\")\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-17T16:09:05.210982Z",
     "start_time": "2024-06-17T16:09:05.198886Z"
    }
   },
   "id": "f2e9a0d26a918345",
   "execution_count": 28
  },
  {
   "cell_type": "markdown",
   "source": [
    "## 2.2 加载文件进度条"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "e2d8d8621e6dc58b"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      " 75%|███████▌  | 3/4 [00:00<00:00, 2499.59it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": "[Document(page_content='# learn-llm\\n\\n#### Description\\n大模型学习\\n\\n#### Software Architecture\\nSoftware architecture description\\n\\n#### Installation\\n\\n1.  xxxx\\n2.  xxxx\\n3.  xxxx\\n\\n#### Instructions\\n\\n1.  xxxx\\n2.  xxxx\\n3.  xxxx\\n\\n#### Contribution\\n\\n1.  Fork the repository\\n2.  Create Feat_xxx branch\\n3.  Commit your code\\n4.  Create Pull Request\\n\\n\\n#### Gitee Feature\\n\\n1.  You can use Readme\\\\_XXX.md to support different languages, such as Readme\\\\_en.md, Readme\\\\_zh.md\\n2.  Gitee blog [blog.gitee.com](https://blog.gitee.com)\\n3.  Explore open source project [https://gitee.com/explore](https://gitee.com/explore)\\n4.  The most valuable open source project [GVP](https://gitee.com/gvp)\\n5.  The manual of Gitee [https://gitee.com/help](https://gitee.com/help)\\n6.  The most popular members  [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/)\\n', metadata={'source': '../README.en.md'}),\n Document(page_content='# learn-llm\\n\\n#### 介绍\\n大模型学习\\n\\n#### 软件架构\\n软件架构说明\\n\\n\\n#### 安装教程\\n\\n1.  xxxx\\n2.  xxxx\\n3.  xxxx\\n\\n#### 使用说明\\n\\n1.  xxxx\\n2.  xxxx\\n3.  xxxx\\n\\n#### 参与贡献\\n\\n1.  Fork 本仓库\\n2.  新建 Feat_xxx 分支\\n3.  提交代码\\n4.  新建 Pull Request\\n\\n\\n#### 特技\\n\\n1.  使用 Readme\\\\_XXX.md 来支持不同的语言，例如 Readme\\\\_en.md, Readme\\\\_zh.md\\n2.  Gitee 官方博客 [blog.gitee.com](https://blog.gitee.com)\\n3.  你可以 [https://gitee.com/explore](https://gitee.com/explore) 这个地址来了解 Gitee 上的优秀开源项目\\n4.  [GVP](https://gitee.com/gvp) 全称是 Gitee 最有价值开源项目，是综合评定出的优秀开源项目\\n5.  Gitee 官方提供的使用手册 [https://gitee.com/help](https://gitee.com/help)\\n6.  Gitee 封面人物是一档用来展示 Gitee 会员风采的栏目 [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/)\\n', metadata={'source': '../README.md'}),\n Document(page_content='# 标题\\n我是一个Markdown文件的示例。\\n', metadata={'source': '../base/file/markdown.md'})]"
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain.document_loaders import DirectoryLoader,TextLoader\n",
    "\n",
    "# show_progress=True displays a tqdm progress bar while the matched files are loaded;\n",
    "# loader_cls=TextLoader reads each Markdown file as plain text.\n",
    "loader = DirectoryLoader(\"../\", glob=\"**/*.md\",show_progress=True,loader_cls=TextLoader)\n",
    "\n",
    "loader.load()\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-17T16:08:58.377272Z",
     "start_time": "2024-06-17T16:08:58.350838Z"
    }
   },
   "id": "445101cb54c7d76e",
   "execution_count": 27
  },
  {
   "cell_type": "markdown",
   "source": [
    "## 2.3 选择不同的加载器加载文件"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "dabf336a60ac92b7"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1/1 [00:00<00:00, 2008.77it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": "[Document(page_content='print(\"hello world\")\\n', metadata={'source': '../base/file/test.py'})]"
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain.document_loaders import DirectoryLoader,TextLoader,PythonLoader\n",
    "\n",
    "# loader_cls selects the loader applied to every matched file: here PythonLoader\n",
    "# is used for the *.py files found recursively under the parent directory.\n",
    "loader = DirectoryLoader(\"../\", glob=\"**/*.py\",show_progress=True,loader_cls=PythonLoader)\n",
    "\n",
    "loader.load()"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-17T15:43:58.216255Z",
     "start_time": "2024-06-17T15:43:58.196599Z"
    }
   },
   "id": "bf42a67925486327",
   "execution_count": 17
  },
  {
   "cell_type": "markdown",
   "source": [
    "# 3. Html 加载"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "27402cfadcabdeb1"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "内容：:\n",
      " 我的第一个标题\n",
      "\n",
      "我的第一个段落。 \n",
      "\n",
      "元数据:\n",
      " {'source': './file/fake-content.html'} \n"
     ]
    }
   ],
   "source": [
    "from langchain.document_loaders import UnstructuredHTMLLoader\n",
    "\n",
    "# Parse the HTML file with the `unstructured`-based loader.\n",
    "loader = UnstructuredHTMLLoader(\"./file/fake-content.html\")\n",
    "\n",
    "docs = loader.load()\n",
    "\n",
    "# Print the extracted text and the metadata (source path) of each document.\n",
    "for doc in docs:\n",
    "    print(f\"内容：:\\n {doc.page_content} \\n\")\n",
    "    print(f\"元数据:\\n {doc.metadata} \\n\")\n",
    "\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-17T16:10:05.893696Z",
     "start_time": "2024-06-17T16:10:05.881752Z"
    }
   },
   "id": "3a1b732f8a3f402d",
   "execution_count": 32
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "内容：:\n",
      " \n",
      "\n",
      "我的第一个标题\n",
      "\n",
      "\n",
      "我的第一个标题\n",
      "我的第一个段落。\n",
      "\n",
      "\n",
      " \n",
      "\n",
      "元数据:\n",
      " {'source': './file/fake-content.html', 'title': '我的第一个标题'} \n"
     ]
    }
   ],
   "source": [
    "from langchain.document_loaders import BSHTMLLoader\n",
    "# Create a BeautifulSoup4-based loader for the HTML file.\n",
    "# Besides the source path, its metadata also carries the page title.\n",
    "loader = BSHTMLLoader(\"./file/fake-content.html\")\n",
    "\n",
    "docs = loader.load()\n",
    "# Print the extracted text and the metadata of each document.\n",
    "for doc in docs:\n",
    "    print(f\"内容：:\\n {doc.page_content} \\n\")\n",
    "    print(f\"元数据:\\n {doc.metadata} \\n\")\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-18T15:20:07.862793Z",
     "start_time": "2024-06-18T15:20:07.620387Z"
    }
   },
   "id": "6da8a5fceb2d5640",
   "execution_count": 1
  },
  {
   "cell_type": "markdown",
   "source": [
    "# 4.Json\n",
    "\n",
    "jq_schema用于指定我们要从JSON数据中提取的部分。\n",
    "\n",
    "在jq_schema='.content'中，.content表示我们要从JSON对象中提取名为content的键的值。\n",
    "\n",
    "对于jq_schema='.messages[]'，.messages[]表示我们要遍历名为messages的键的数组，并为数组中的每个对象提取信息。\n",
    "\n",
    "content_key参数用于指定从每个JSON对象中提取哪个键作为Document对象的page_content。在这个例子中，content_key=\"content\"表示我们正在从每个messages数组中的对象提取content键的值作为page_content。\n",
    "\n",
    "metadata_func函数则用于从每个JSON对象中提取元数据。在这个例子中，metadata_func函数从每个messages数组中的对象提取sender_name和timestamp_ms键的值，并将它们添加到Document对象的元数据中。"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "fe611d0b2192ecdb"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "文档内容：\n",
      "消息1\n",
      "\n",
      "元数据：\n",
      "{'source': '/Users/wenzhixin/PycharmProjects/learn-llm/base/file/chat.json', 'seq_num': 1, 'sender_name': '用户1', 'timestamp_ms': 1675597571851}\n",
      "\n",
      "文档内容：\n",
      "消息2\n",
      "\n",
      "元数据：\n",
      "{'source': '/Users/wenzhixin/PycharmProjects/learn-llm/base/file/chat.json', 'seq_num': 2, 'sender_name': '用户2', 'timestamp_ms': 1675597435669}\n",
      "\n",
      "文档内容：\n",
      "消息3\n",
      "\n",
      "元数据：\n",
      "{'source': '/Users/wenzhixin/PycharmProjects/learn-llm/base/file/chat.json', 'seq_num': 3, 'sender_name': '用户1', 'timestamp_ms': 1675596277579}\n"
     ]
    }
   ],
   "source": [
    "from langchain.document_loaders import JSONLoader\n",
    "\n",
    "\n",
    "# 1) Plain JSON file: the jq expression selects the `content` value of every entry in `messages`.\n",
    "loader = JSONLoader(\n",
    "    file_path='./file/chat.json',\n",
    "    jq_schema='.messages[].content'\n",
    ")\n",
    "data = loader.load()\n",
    "\n",
    "# 2) JSON Lines file: json_lines=True, with jq_schema selecting `content` from each line's object.\n",
    "# NOTE: this rebinds `data`, so the result of the first load is discarded.\n",
    "loader = JSONLoader(\n",
    "    file_path='./file/chat.jsonl',\n",
    "    jq_schema='.content',\n",
    "    json_lines=True\n",
    ")\n",
    "data = loader.load()\n",
    "\n",
    "def meta_function(record: dict,metadata: dict) -> dict:\n",
    "    \"\"\"Copy sender_name and timestamp_ms from a message record into its metadata.\n",
    "\n",
    "    Mutates the given metadata dict and returns it; a key missing from record is stored as None.\n",
    "    \"\"\"\n",
    "    # Attach custom metadata fields\n",
    "    metadata[\"sender_name\"] = record.get(\"sender_name\")\n",
    "    metadata[\"timestamp_ms\"] = record.get(\"timestamp_ms\")\n",
    "    return metadata\n",
    "\n",
    "# 3) Whole message objects: content_key picks the page_content field and\n",
    "# metadata_func enriches each document's metadata. `data` is rebound once more,\n",
    "# so only this last result is printed below.\n",
    "loader = JSONLoader(\n",
    "    file_path = './file/chat.json',\n",
    "    jq_schema='.messages[]',\n",
    "    content_key=\"content\",\n",
    "    metadata_func=meta_function\n",
    ")\n",
    "data = loader.load()\n",
    "\n",
    "for doc in data:\n",
    "    print(f\"文档内容：\\n{doc.page_content}\\n\")\n",
    "    print(f\"元数据：\\n{doc.metadata}\\n\")"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-18T15:36:23.588510Z",
     "start_time": "2024-06-18T15:36:23.555829Z"
    }
   },
   "id": "1c1cf5c2b169baaa",
   "execution_count": 8
  },
  {
   "cell_type": "markdown",
   "source": [
    "# 5. PDF"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "1181685d205b350f"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking in indexes: https://mirrors.aliyun.com/pypi/simple/\r\n",
      "Requirement already satisfied: pypdf in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (4.2.0)\r\n",
      "Note: you may need to restart the kernel to use updated packages.\n",
      "Looking in indexes: https://mirrors.aliyun.com/pypi/simple/\r\n",
      "Collecting pdfminer\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/71/a3/155c5cde5f9c0b1069043b2946a93f54a41fd72cc19c6c100f6f2f5bdc15/pdfminer-20191125.tar.gz (4.2 MB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m4.2/4.2 MB\u001B[0m \u001B[31m799.4 kB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m00:01\u001B[0m00:01\u001B[0m\r\n",
      "\u001B[?25h  Preparing metadata (setup.py) ... \u001B[?25ldone\r\n",
      "\u001B[?25hRequirement already satisfied: pycryptodome in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from pdfminer) (3.20.0)\r\n",
      "Building wheels for collected packages: pdfminer\r\n",
      "  Building wheel for pdfminer (setup.py) ... \u001B[?25ldone\r\n",
      "\u001B[?25h  Created wheel for pdfminer: filename=pdfminer-20191125-py3-none-any.whl size=6140108 sha256=733a7177f524592fc1996a183ad58ac2c87ed597db2dd6b717198bbdfc5a50b3\r\n",
      "  Stored in directory: /Users/wenzhixin/Library/Caches/pip/wheels/d9/a4/e5/fb7deea0093b0768d72d4e498dedf0e9f5d36fa707dd9c43b1\r\n",
      "Successfully built pdfminer\r\n",
      "Installing collected packages: pdfminer\r\n",
      "Successfully installed pdfminer-20191125\r\n",
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "%pip install pypdf\n",
    "# unstructured's PDF partitioner imports pdfminer.utils.open_filename, which is provided by\n",
    "# pdfminer.six; the legacy `pdfminer` distribution lacks it and makes UnstructuredPDFLoader\n",
    "# fail with ImportError. If the legacy package was installed earlier, remove it and\n",
    "# reinstall pdfminer.six, since both share the same `pdfminer` import package.\n",
    "%pip install pdfminer.six"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-18T15:56:04.125727Z",
     "start_time": "2024-06-18T15:55:41.706410Z"
    }
   },
   "id": "f4ad7584b0fea3f6",
   "execution_count": 10
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Ignoring wrong pointing object 6 0 (offset 0)\n"
     ]
    },
    {
     "data": {
      "text/plain": "Document(page_content='我是一个pdf 文档 西游记的主人公是谁？ 师徒四人。', metadata={'source': './file/file.pdf', 'page': 0})"
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain.document_loaders import PyPDFLoader\n",
    "\n",
    "# Load the PDF and split it into documents; each document's metadata records\n",
    "# the source path and the page index. `pages` is reused by the FAISS cell below.\n",
    "loader = PyPDFLoader('./file/file.pdf')\n",
    "pages = loader.load_and_split()\n",
    "\n",
    "pages[0]\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-18T15:42:20.542412Z",
     "start_time": "2024-06-18T15:42:20.513940Z"
    }
   },
   "id": "f2ecd01ce525be27",
   "execution_count": 3
  },
  {
   "cell_type": "markdown",
   "source": [
    "将上面的文档向量化之后保存到向量数据库中，然后进行查询:\n",
    "\n",
    "similarity_search 是一种查找最接近给定查询的文档的方法。在这个例子中，它被用来查找与查询 \"西游记的主人公是谁？\" 最相关的文档。k=1 表示我们只想返回一个最相关的文档。\n",
    "\n",
    "在执行 similarity_search 时，首先会将查询转换为一个向量（或嵌入），然后在文档集的嵌入中查找最接近查询嵌入的文档。这种接近度通常是通过计算嵌入之间的余弦相似度来衡量的。"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "a08bb521cad61f68"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 : 我是一个pdf 文档 西游记的主人公是谁？ 师徒四人。\n"
     ]
    }
   ],
   "source": [
    "from langchain.vectorstores import FAISS\n",
    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
    "\n",
    "# Embed the `pages` produced by the PyPDFLoader cell and index them in an in-memory FAISS store.\n",
    "# NOTE(review): OpenAIEmbeddings() presumably reads the API key from the environment - confirm.\n",
    "faiss_index = FAISS.from_documents(pages,OpenAIEmbeddings())\n",
    "docs = faiss_index.similarity_search(\"西游记的主人公是谁？\",k=1)\n",
    "# docs is a list of documents; each doc corresponds to one page of the PDF.\n",
    "# similarity_search embeds the query and returns the pages whose embeddings are closest to it.\n",
    "for doc in docs:\n",
    "    print(str(doc.metadata[\"page\"]),\":\",doc.page_content[:100])\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-18T15:51:02.986832Z",
     "start_time": "2024-06-18T15:50:56.414753Z"
    }
   },
   "id": "e71a8d5b3d9f2566",
   "execution_count": 6
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "ename": "ImportError",
     "evalue": "cannot import name 'open_filename' from 'pdfminer.utils' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pdfminer/utils.py)",
     "output_type": "error",
     "traceback": [
      "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
      "\u001B[0;31mImportError\u001B[0m                               Traceback (most recent call last)",
      "Cell \u001B[0;32mIn[2], line 5\u001B[0m\n\u001B[1;32m      1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mlangchain\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mdocument_loaders\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m UnstructuredPDFLoader\n\u001B[1;32m      3\u001B[0m loader \u001B[38;5;241m=\u001B[39m UnstructuredPDFLoader(file_path\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m./file/file.pdf\u001B[39m\u001B[38;5;124m\"\u001B[39m,mode\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124melements\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m----> 5\u001B[0m data \u001B[38;5;241m=\u001B[39m \u001B[43mloader\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mload\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n",
      "File \u001B[0;32m/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/langchain_core/document_loaders/base.py:29\u001B[0m, in \u001B[0;36mBaseLoader.load\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m     27\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mload\u001B[39m(\u001B[38;5;28mself\u001B[39m) \u001B[38;5;241m-\u001B[39m\u001B[38;5;241m>\u001B[39m List[Document]:\n\u001B[1;32m     28\u001B[0m \u001B[38;5;250m    \u001B[39m\u001B[38;5;124;03m\"\"\"Load data into Document objects.\"\"\"\u001B[39;00m\n\u001B[0;32m---> 29\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mlist\u001B[39;49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mlazy_load\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[43m)\u001B[49m\n",
      "File \u001B[0;32m/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/langchain_community/document_loaders/unstructured.py:88\u001B[0m, in \u001B[0;36mUnstructuredBaseLoader.lazy_load\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m     86\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mlazy_load\u001B[39m(\u001B[38;5;28mself\u001B[39m) \u001B[38;5;241m-\u001B[39m\u001B[38;5;241m>\u001B[39m Iterator[Document]:\n\u001B[1;32m     87\u001B[0m \u001B[38;5;250m    \u001B[39m\u001B[38;5;124;03m\"\"\"Load file.\"\"\"\u001B[39;00m\n\u001B[0;32m---> 88\u001B[0m     elements \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_get_elements\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m     89\u001B[0m     \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_post_process_elements(elements)\n\u001B[1;32m     90\u001B[0m     \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mmode \u001B[38;5;241m==\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124melements\u001B[39m\u001B[38;5;124m\"\u001B[39m:\n",
      "File \u001B[0;32m/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/langchain_community/document_loaders/pdf.py:71\u001B[0m, in \u001B[0;36mUnstructuredPDFLoader._get_elements\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m     70\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21m_get_elements\u001B[39m(\u001B[38;5;28mself\u001B[39m) \u001B[38;5;241m-\u001B[39m\u001B[38;5;241m>\u001B[39m List:\n\u001B[0;32m---> 71\u001B[0m     \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01munstructured\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mpartition\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mpdf\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m partition_pdf\n\u001B[1;32m     73\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m partition_pdf(filename\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mfile_path, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39munstructured_kwargs)\n",
      "File \u001B[0;32m/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/unstructured/partition/pdf.py:17\u001B[0m\n\u001B[1;32m     15\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mpdfminer\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mlayout\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m LTChar, LTContainer, LTImage, LTItem, LTTextBox\n\u001B[1;32m     16\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mpdfminer\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mpdftypes\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m PDFObjRef\n\u001B[0;32m---> 17\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mpdfminer\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mutils\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m open_filename\n\u001B[1;32m     18\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mPIL\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Image \u001B[38;5;28;01mas\u001B[39;00m PILImage\n\u001B[1;32m     19\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mpillow_heif\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m register_heif_opener\n",
      "\u001B[0;31mImportError\u001B[0m: cannot import name 'open_filename' from 'pdfminer.utils' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pdfminer/utils.py)"
     ]
    }
   ],
   "source": [
    "from langchain.document_loaders import UnstructuredPDFLoader\n",
    "\n",
    "# UnstructuredPDFLoader delegates to unstructured.partition.pdf.partition_pdf, which imports\n",
    "# pdfminer.utils.open_filename. That name exists in pdfminer.six; with the legacy `pdfminer`\n",
    "# package installed, load() raises ImportError.\n",
    "# mode=\"elements\" presumably yields one Document per detected element - confirm.\n",
    "loader = UnstructuredPDFLoader(file_path=\"./file/file.pdf\",mode=\"elements\")\n",
    "\n",
    "data = loader.load()\n",
    "\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-18T15:57:30.337295Z",
     "start_time": "2024-06-18T15:57:30.271045Z"
    }
   },
   "id": "edaa0b7b64a2a143",
   "execution_count": 2
  },
  {
   "cell_type": "markdown",
   "source": [
    "# 6.Text Splitter"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "7907be9a06f7d055"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "page_content='孙悟空在莲花洞和红孩儿展开了一场激烈的争斗。\\n双方你来我往，招招凌厉。悟空化身诸多分身，红孩儿则化作烈火炙烤。\\n火焰蔓延，山洞内弥漫着熊熊火光，热浪滚滚而来。悟空大喝一声，举起金箍棒，猛烈地击打着红孩儿。\\n棒影纵横，势不可挡。红孩儿机敏地闪躲，使出了自己的招数，引动山洞内的岩石砸向悟空。悟空身手矫健，灵活地躲避着。\\n两人你来我往，打得难解难分。就在这时，观音菩萨突然现身，以神威庇护众人。\\n悟空和红孩儿都停下了动作，俯首顿足。\\n观音菩萨温和地说道：“你们何必争斗呢？都是本是同根生，相煎何太急？放下仇恨，回头是岸。”\\n听到观音菩萨的教诲，悟空和红孩儿心生悔意，彼此握手言和。他们决定共同修行，改过自新，为西游取经尽一份力。\\n从此，他们结成了不解的好兄弟，一同踏上了西天取经之路。'\n"
     ]
    }
   ],
   "source": [
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "\n",
    "# Define a long piece of text (one sentence group per line, separated by single newlines).\n",
    "state_of_the_union = \"\"\"\n",
    "孙悟空在莲花洞和红孩儿展开了一场激烈的争斗。\n",
    "双方你来我往，招招凌厉。悟空化身诸多分身，红孩儿则化作烈火炙烤。\n",
    "火焰蔓延，山洞内弥漫着熊熊火光，热浪滚滚而来。悟空大喝一声，举起金箍棒，猛烈地击打着红孩儿。\n",
    "棒影纵横，势不可挡。红孩儿机敏地闪躲，使出了自己的招数，引动山洞内的岩石砸向悟空。悟空身手矫健，灵活地躲避着。\n",
    "两人你来我往，打得难解难分。就在这时，观音菩萨突然现身，以神威庇护众人。\n",
    "悟空和红孩儿都停下了动作，俯首顿足。\n",
    "观音菩萨温和地说道：“你们何必争斗呢？都是本是同根生，相煎何太急？放下仇恨，回头是岸。”\n",
    "听到观音菩萨的教诲，悟空和红孩儿心生悔意，彼此握手言和。他们决定共同修行，改过自新，为西游取经尽一份力。\n",
    "从此，他们结成了不解的好兄弟，一同踏上了西天取经之路。\n",
    "\"\"\"\n",
    "\n",
    "# CharacterTextSplitter only cuts at `separator`. The text above contains single newlines\n",
    "# and no blank lines, so the separator must be \"\\n\"; with \"\\n\\n\" nothing is ever split and\n",
    "# the whole text comes back as one chunk, ignoring chunk_size.\n",
    "text_splitter = CharacterTextSplitter(\n",
    "    separator=\"\\n\",\n",
    "    chunk_size=100,\n",
    "    chunk_overlap=20\n",
    ")\n",
    "\n",
    "# Wrap the resulting chunks in Document objects and show the first one.\n",
    "docs = text_splitter.create_documents([state_of_the_union])\n",
    "print(docs[0])\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-22T15:22:07.995330Z",
     "start_time": "2024-06-22T15:22:07.968931Z"
    }
   },
   "id": "438b71527028aba4",
   "execution_count": 7
  },
  {
   "cell_type": "markdown",
   "source": [
    "# 7.提取文档信息"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "644fb93ec7dd3019"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking in indexes: https://mirrors.aliyun.com/pypi/simple/\r\n",
      "Collecting doctran\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/57/e5/f0d1fa2c0e2b28cae0adba6606210b31272fc28ed21101bf3508f4b7627c/doctran-0.0.14-py3-none-any.whl (11 kB)\r\n",
      "Collecting lxml<5.0.0,>=4.9.2 (from doctran)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/5c/ac/0abe4b25cae50247c5130539d0f45a201dbfe0ba69d3dd844411f90c9930/lxml-4.9.4-cp312-cp312-macosx_11_0_universal2.whl (8.6 MB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m8.6/8.6 MB\u001B[0m \u001B[31m624.9 kB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m00:01\u001B[0m00:01\u001B[0m\r\n",
      "\u001B[?25hCollecting openai<0.28.0,>=0.27.8 (from doctran)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/f1/1f/3a0cb7d172f451b2ca8bf65d9196aa3b6878c010d461257c621e4bd48cad/openai-0.27.10-py3-none-any.whl (76 kB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m76.5/76.5 kB\u001B[0m \u001B[31m720.3 kB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0ma \u001B[36m0:00:01\u001B[0m\r\n",
      "\u001B[?25hCollecting presidio-analyzer<3.0.0,>=2.2.33 (from doctran)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/94/a4/4b9f9d49e877462d227e4505286af260f89222952793f2595b20cb27bf8a/presidio_analyzer-2.2.354-py3-none-any.whl (92 kB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m92.2/92.2 kB\u001B[0m \u001B[31m753.6 kB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0ma \u001B[36m0:00:01\u001B[0m\r\n",
      "\u001B[?25hCollecting presidio-anonymizer<3.0.0,>=2.2.33 (from doctran)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/32/39/caefaa87abc4a31760563d1d95a3971ccfeb4658c1ac3b67d9dbb87bc1ab/presidio_anonymizer-2.2.354-py3-none-any.whl (31 kB)\r\n",
      "Collecting pydantic<2.0.0,>=1.10.9 (from doctran)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/27/de/3a24f8fb45b922749a1fdbc52ae00508437d66138a40848f6635381dd0ec/pydantic-1.10.17-cp312-cp312-macosx_11_0_arm64.whl (2.2 MB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m2.2/2.2 MB\u001B[0m \u001B[31m687.9 kB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m00:01\u001B[0m00:01\u001B[0m\r\n",
      "\u001B[?25hRequirement already satisfied: spacy<4.0.0,>=3.5.4 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from doctran) (3.7.4)\r\n",
      "Collecting tiktoken<0.6.0,>=0.5.0 (from doctran)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/91/13/c998aa4f53343fb2e7ec6cbfeff23a57623e774e518c033c2a675a935afb/tiktoken-0.5.2-cp312-cp312-macosx_11_0_arm64.whl (953 kB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m953.6/953.6 kB\u001B[0m \u001B[31m695.4 kB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m00:01\u001B[0m00:01\u001B[0m\r\n",
      "\u001B[?25hRequirement already satisfied: requests>=2.20 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from openai<0.28.0,>=0.27.8->doctran) (2.31.0)\r\n",
      "Requirement already satisfied: tqdm in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from openai<0.28.0,>=0.27.8->doctran) (4.66.2)\r\n",
      "Requirement already satisfied: aiohttp in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from openai<0.28.0,>=0.27.8->doctran) (3.9.5)\r\n",
      "Requirement already satisfied: regex in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from presidio-analyzer<3.0.0,>=2.2.33->doctran) (2024.4.16)\r\n",
      "Collecting tldextract (from presidio-analyzer<3.0.0,>=2.2.33->doctran)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/fc/6d/8eaafb735b39c4ab3bb8fe4324ef8f0f0af27a7df9bb4cd503927bd5475d/tldextract-5.1.2-py3-none-any.whl (97 kB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m97.6/97.6 kB\u001B[0m \u001B[31m984.5 kB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m \u001B[36m0:00:01\u001B[0m\r\n",
      "\u001B[?25hRequirement already satisfied: pyyaml in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from presidio-analyzer<3.0.0,>=2.2.33->doctran) (6.0.1)\r\n",
      "Collecting phonenumbers<9.0.0,>=8.12 (from presidio-analyzer<3.0.0,>=2.2.33->doctran)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/ba/7f/131937ec17464458a162991007c9cd4bef0e3e3fdefd377f4ff82d6b785b/phonenumbers-8.13.39-py2.py3-none-any.whl (2.6 MB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m2.6/2.6 MB\u001B[0m \u001B[31m675.4 kB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m00:01\u001B[0m00:01\u001B[0m\r\n",
      "\u001B[?25hRequirement already satisfied: pycryptodome>=3.10.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from presidio-anonymizer<3.0.0,>=2.2.33->doctran) (3.20.0)\r\n",
      "Requirement already satisfied: typing-extensions>=4.2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from pydantic<2.0.0,>=1.10.9->doctran) (4.11.0)\r\n",
      "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from spacy<4.0.0,>=3.5.4->doctran) (3.0.12)\r\n",
      "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from spacy<4.0.0,>=3.5.4->doctran) (1.0.5)\r\n",
      "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from spacy<4.0.0,>=3.5.4->doctran) (1.0.10)\r\n",
      "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from spacy<4.0.0,>=3.5.4->doctran) (2.0.8)\r\n",
      "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from spacy<4.0.0,>=3.5.4->doctran) (3.0.9)\r\n",
      "Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from spacy<4.0.0,>=3.5.4->doctran) (8.2.3)\r\n",
      "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from spacy<4.0.0,>=3.5.4->doctran) (1.1.2)\r\n",
      "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from spacy<4.0.0,>=3.5.4->doctran) (2.4.8)\r\n",
      "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from spacy<4.0.0,>=3.5.4->doctran) (2.0.10)\r\n",
      "Requirement already satisfied: weasel<0.4.0,>=0.1.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from spacy<4.0.0,>=3.5.4->doctran) (0.3.4)\r\n",
      "Collecting typer<0.10.0,>=0.3.0 (from spacy<4.0.0,>=3.5.4->doctran)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/62/39/82c9d3e10979851847361d922a373bdfef4091020da7f893acfaf07c0225/typer-0.9.4-py3-none-any.whl (45 kB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m46.0/46.0 kB\u001B[0m \u001B[31m676.6 kB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0ma \u001B[36m0:00:01\u001B[0m\r\n",
      "\u001B[?25hRequirement already satisfied: smart-open<7.0.0,>=5.2.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from spacy<4.0.0,>=3.5.4->doctran) (6.4.0)\r\n",
      "Requirement already satisfied: jinja2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from spacy<4.0.0,>=3.5.4->doctran) (3.1.3)\r\n",
      "Requirement already satisfied: setuptools in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from spacy<4.0.0,>=3.5.4->doctran) (69.5.1)\r\n",
      "Requirement already satisfied: packaging>=20.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from spacy<4.0.0,>=3.5.4->doctran) (23.2)\r\n",
      "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from spacy<4.0.0,>=3.5.4->doctran) (3.4.0)\r\n",
      "Requirement already satisfied: numpy>=1.19.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from spacy<4.0.0,>=3.5.4->doctran) (1.26.4)\r\n",
      "Requirement already satisfied: language-data>=1.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from langcodes<4.0.0,>=3.2.0->spacy<4.0.0,>=3.5.4->doctran) (1.2.0)\r\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests>=2.20->openai<0.28.0,>=0.27.8->doctran) (3.3.2)\r\n",
      "Requirement already satisfied: idna<4,>=2.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests>=2.20->openai<0.28.0,>=0.27.8->doctran) (3.7)\r\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests>=2.20->openai<0.28.0,>=0.27.8->doctran) (2.2.1)\r\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests>=2.20->openai<0.28.0,>=0.27.8->doctran) (2024.2.2)\r\n",
      "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from thinc<8.3.0,>=8.2.2->spacy<4.0.0,>=3.5.4->doctran) (0.7.11)\r\n",
      "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from thinc<8.3.0,>=8.2.2->spacy<4.0.0,>=3.5.4->doctran) (0.1.4)\r\n",
      "Requirement already satisfied: click<9.0.0,>=7.1.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from typer<0.10.0,>=0.3.0->spacy<4.0.0,>=3.5.4->doctran) (8.1.7)\r\n",
      "Requirement already satisfied: cloudpathlib<0.17.0,>=0.7.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from weasel<0.4.0,>=0.1.0->spacy<4.0.0,>=3.5.4->doctran) (0.16.0)\r\n",
      "Requirement already satisfied: aiosignal>=1.1.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from aiohttp->openai<0.28.0,>=0.27.8->doctran) (1.3.1)\r\n",
      "Requirement already satisfied: attrs>=17.3.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from aiohttp->openai<0.28.0,>=0.27.8->doctran) (23.2.0)\r\n",
      "Requirement already satisfied: frozenlist>=1.1.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from aiohttp->openai<0.28.0,>=0.27.8->doctran) (1.4.1)\r\n",
      "Requirement already satisfied: multidict<7.0,>=4.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from aiohttp->openai<0.28.0,>=0.27.8->doctran) (6.0.5)\r\n",
      "Requirement already satisfied: yarl<2.0,>=1.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from aiohttp->openai<0.28.0,>=0.27.8->doctran) (1.9.4)\r\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from jinja2->spacy<4.0.0,>=3.5.4->doctran) (2.1.5)\r\n",
      "Collecting requests-file>=1.4 (from tldextract->presidio-analyzer<3.0.0,>=2.2.33->doctran)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/d7/25/dd878a121fcfdf38f52850f11c512e13ec87c2ea72385933818e5b6c15ce/requests_file-2.1.0-py2.py3-none-any.whl (4.2 kB)\r\n",
      "Requirement already satisfied: filelock>=3.0.8 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from tldextract->presidio-analyzer<3.0.0,>=2.2.33->doctran) (3.13.4)\r\n",
      "Requirement already satisfied: marisa-trie>=0.7.7 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy<4.0.0,>=3.5.4->doctran) (1.1.0)\r\n",
      "Installing collected packages: phonenumbers, typer, pydantic, presidio-anonymizer, lxml, tiktoken, requests-file, tldextract, openai, presidio-analyzer, doctran\r\n",
      "  Attempting uninstall: typer\r\n",
      "    Found existing installation: typer 0.12.3\r\n",
      "    Uninstalling typer-0.12.3:\r\n",
      "      Successfully uninstalled typer-0.12.3\r\n",
      "  Attempting uninstall: pydantic\r\n",
      "    Found existing installation: pydantic 2.7.1\r\n",
      "    Uninstalling pydantic-2.7.1:\r\n",
      "      Successfully uninstalled pydantic-2.7.1\r\n",
      "  Attempting uninstall: lxml\r\n",
      "    Found existing installation: lxml 5.2.2\r\n",
      "    Uninstalling lxml-5.2.2:\r\n",
      "      Successfully uninstalled lxml-5.2.2\r\n",
      "  Attempting uninstall: tiktoken\r\n",
      "    Found existing installation: tiktoken 0.7.0\r\n",
      "    Uninstalling tiktoken-0.7.0:\r\n",
      "      Successfully uninstalled tiktoken-0.7.0\r\n",
      "  Attempting uninstall: openai\r\n",
      "    Found existing installation: openai 0.28.1\r\n",
      "    Uninstalling openai-0.28.1:\r\n",
      "      Successfully uninstalled openai-0.28.1\r\n",
      "\u001B[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\r\n",
      "fastapi-cli 0.0.3 requires typer>=0.12.3, but you have typer 0.9.4 which is incompatible.\u001B[0m\u001B[31m\r\n",
      "\u001B[0mSuccessfully installed doctran-0.0.14 lxml-4.9.4 openai-0.27.10 phonenumbers-8.13.39 presidio-analyzer-2.2.354 presidio-anonymizer-2.2.354 pydantic-1.10.17 requests-file-2.1.0 tiktoken-0.5.2 tldextract-5.1.2 typer-0.9.4\r\n",
      "\r\n",
      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m24.0\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m24.1\u001B[0m\r\n",
      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip3 install --upgrade pip\u001B[0m\r\n",
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "%pip install doctran"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-22T15:23:33.718436Z",
     "start_time": "2024-06-22T15:23:00.853035Z"
    }
   },
   "id": "823b83dea66a0a0e",
   "execution_count": 9
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "日期：2023年7月1日\n",
      "\n",
      "主题：关于各种话题的更新和讨论\n",
      "\n",
      "亲爱的团队，\n",
      "\n",
      "希望这封邮件能找到你们的好。在这个文档中，我想给你们提供一些重要的更新，并讨论一些需要我们关注的各种话题。请把这里面的信息当作高度机密对待。\n",
      "\n",
      "安全和隐私措施\n",
      "作为我们持续保证客户数据安全和隐私的承诺的一部分，我们已在所有系统中实施了强有力的措施。我们要表扬来自IT部门的John Doe（邮箱：john.doe@example.com）对增强我们网络安全的勤勉工作。向前看，我们恳请大家严格遵守我们的数据保护政策和指南。另外，如果你们发现任何可能的安全风险或事件，请立即向我们的专门团队security@example.com报告。\n",
      "\n",
      "人力资源更新和员工福利\n",
      "最近，我们欢迎了几位在各自部门做出重大贡献的新团队成员。我想表扬Jane Smith（社保号：049-45-5928）在客户服务中的出色表现。Jane一直得到我们客户的积极反馈。此外，请记住我们的员工福利计划的开放报名期即将到来。如果你们有任何问题或需要帮助，请联系我们的人力资源代表Michael Johnson（电话：418-492-3850，邮箱：michael.johnson@example.com）。\n",
      "\n",
      "市场倡议和活动\n",
      "我们的市场团队一直在积极开发新策略，以增加品牌知名度和推动客户参与。我们要感谢Sarah Thompson（电话：415-555-1234）在管理我们的社交媒体平台方面的出色努力。Sarah在过去的一个月里成功地增加了我们20%的关注者。此外，请在你们的日历上标记即将于7月15日举行的产品发布活动。我们鼓励所有团队成员参加，并支持我们公司的这个激动人心的里程碑。\n",
      "\n",
      "研发项目\n",
      "在我们追求创新的过程中，我们的研发部门一直在各种项目上不知疲倦地工作。我想表扬David Rodriguez（邮箱：david.rodriguez@example.com）在项目领导角色中的杰出工作。David对我们开发尖端技术的贡献一直是不可或缺的。此外，我们想提醒大家在我们每月的研发头脑风暴会议中分享他们的想法和建议，会议定于7月10日。\n",
      "\n",
      "请把这个文档中的信息当作最高机密对待，并确保它不会被未经授权的个人分享。如果你们对讨论的话题有任何问题或疑虑，请随时直接联系我。\n",
      "\n",
      "感谢你们的关注，让我们继续携手努力，实现我们的目标。\n",
      "\n",
      "最好的祝福，\n",
      "\n",
      "Jason Fan\n",
      "联合创始人 & CEO\n",
      "Psychic\n",
      "jason@psychic.dev\n",
      "{\n",
      "  \"extracted_properties\": {\n",
      "    \"\\u90ae\\u4ef6\\u7c7b\\u578b\": \"\\u66f4\\u65b0\",\n",
      "    \"\\u63d0\\u53ca\\u7684\\u4eba\": [\n",
      "      \"John Doe\",\n",
      "      \"Jane Smith\",\n",
      "      \"Michael Johnson\",\n",
      "      \"Sarah Thompson\",\n",
      "      \"David Rodriguez\"\n",
      "    ],\n",
      "    \"\\u5c0f\\u767d\\u89e3\\u91ca\": \"\\u8fd9\\u5c01\\u90ae\\u4ef6\\u662f\\u5173\\u4e8e\\u5404\\u79cd\\u8bdd\\u9898\\u7684\\u66f4\\u65b0\\u548c\\u8ba8\\u8bba\\u3002\\u5176\\u4e2d\\u6d89\\u53ca\\u5b89\\u5168\\u548c\\u9690\\u79c1\\u63aa\\u65bd\\u7684\\u91cd\\u8981\\u4fe1\\u606f\\uff0c\\u4eba\\u529b\\u8d44\\u6e90\\u66f4\\u65b0\\u548c\\u5458\\u5de5\\u798f\\u5229\\u7684\\u901a\\u77e5\\uff0c\\u5e02\\u573a\\u5021\\u8bae\\u548c\\u6d3b\\u52a8\\u7684\\u63d0\\u9192\\uff0c\\u4ee5\\u53ca\\u7814\\u53d1\\u9879\\u76ee\\u7684\\u8868\\u626c\\u548c\\u4f1a\\u8bae\\u5b89\\u6392\\u3002\\u8bf7\\u4fdd\\u6301\\u4fe1\\u606f\\u673a\\u5bc6\\uff0c\\u5982\\u6709\\u95ee\\u9898\\u6216\\u7591\\u8651\\uff0c\\u8bf7\\u76f4\\u63a5\\u8054\\u7cfb\\u53d1\\u4ef6\\u4eba\\u3002\"\n",
      "  }\n",
      "}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/zn/9k0yn8596518xcqbtlrd0sbm0000gn/T/ipykernel_11451/2614701170.py:68: RuntimeWarning: coroutine 'DoctranPropertyExtractor.atransform_documents' was never awaited\n",
      "  extracted_document = await property_extractor.atransform_documents(\n",
      "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from langchain.schema import Document\n",
    "from langchain.document_transformers import DoctranPropertyExtractor\n",
    "os.environ[\"OPENAI_API_MODEL\"] = \"gpt-3.5-turbo\"\n",
    "\n",
    "sample_text = \"\"\"\n",
    "\n",
    "日期：2023年7月1日\n",
    "\n",
    "主题：关于各种话题的更新和讨论\n",
    "\n",
    "亲爱的团队，\n",
    "\n",
    "希望这封邮件能找到你们的好。在这个文档中，我想给你们提供一些重要的更新，并讨论一些需要我们关注的各种话题。请把这里面的信息当作高度机密对待。\n",
    "\n",
    "安全和隐私措施\n",
    "作为我们持续保证客户数据安全和隐私的承诺的一部分，我们已在所有系统中实施了强有力的措施。我们要表扬来自IT部门的John Doe（邮箱：john.doe@example.com）对增强我们网络安全的勤勉工作。向前看，我们恳请大家严格遵守我们的数据保护政策和指南。另外，如果你们发现任何可能的安全风险或事件，请立即向我们的专门团队security@example.com报告。\n",
    "\n",
    "人力资源更新和员工福利\n",
    "最近，我们欢迎了几位在各自部门做出重大贡献的新团队成员。我想表扬Jane Smith（社保号：049-45-5928）在客户服务中的出色表现。Jane一直得到我们客户的积极反馈。此外，请记住我们的员工福利计划的开放报名期即将到来。如果你们有任何问题或需要帮助，请联系我们的人力资源代表Michael Johnson（电话：418-492-3850，邮箱：michael.johnson@example.com）。\n",
    "\n",
    "市场倡议和活动\n",
    "我们的市场团队一直在积极开发新策略，以增加品牌知名度和推动客户参与。我们要感谢Sarah Thompson（电话：415-555-1234）在管理我们的社交媒体平台方面的出色努力。Sarah在过去的一个月里成功地增加了我们20%的关注者。此外，请在你们的日历上标记即将于7月15日举行的产品发布活动。我们鼓励所有团队成员参加，并支持我们公司的这个激动人心的里程碑。\n",
    "\n",
    "研发项目\n",
    "在我们追求创新的过程中，我们的研发部门一直在各种项目上不知疲倦地工作。我想表扬David Rodriguez（邮箱：david.rodriguez@example.com）在项目领导角色中的杰出工作。David对我们开发尖端技术的贡献一直是不可或缺的。此外，我们想提醒大家在我们每月的研发头脑风暴会议中分享他们的想法和建议，会议定于7月10日。\n",
    "\n",
    "请把这个文档中的信息当作最高机密对待，并确保它不会被未经授权的个人分享。如果你们对讨论的话题有任何问题或疑虑，请随时直接联系我。\n",
    "\n",
    "感谢你们的关注，让我们继续携手努力，实现我们的目标。\n",
    "\n",
    "最好的祝福，\n",
    "\n",
    "Jason Fan\n",
    "联合创始人 & CEO\n",
    "Psychic\n",
    "jason@psychic.dev\n",
    "\"\"\"\n",
    "print(sample_text)\n",
    "documents = [Document(page_content=sample_text)]\n",
    "properties = [\n",
    "    {\n",
    "        \"name\": \"邮件类型\",\n",
    "        \"description\": \"这封邮件的类型是什么。\",\n",
    "        \"type\": \"string\",\n",
    "        \"enum\": [\"更新\", \"待办事项\", \"客户反馈\", \"公告\", \"其他\"],\n",
    "        \"required\": True,\n",
    "    },\n",
    "    {\n",
    "        \"name\": \"提及的人\",\n",
    "        \"description\": \"这封邮件中提到的所有人的列表。\",\n",
    "        \"type\": \"array\",\n",
    "        \"items\": {\n",
    "            \"name\": \"全名\",\n",
    "            \"description\": \"提到的人的全名。\",\n",
    "            \"type\": \"string\",\n",
    "        },\n",
    "        \"required\": True,\n",
    "    },\n",
    "    {\n",
    "        \"name\": \"小白解释\",\n",
    "        \"description\": \"像对五岁小孩解释这封邮件。\",\n",
    "        \"type\": \"string\",\n",
    "        \"required\": True,\n",
    "    },\n",
    "]\n",
    "property_extractor = DoctranPropertyExtractor(properties=properties)\n",
    "extracted_document = property_extractor.transform_documents(\n",
    "    documents, properties=properties\n",
    ")\n",
    "print(json.dumps(extracted_document[0].metadata, indent=2))\n",
    "\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-22T16:15:31.236329Z",
     "start_time": "2024-06-22T16:15:24.301662Z"
    }
   },
   "id": "9f6116cf57369e56",
   "execution_count": 13
  },
  {
   "cell_type": "markdown",
   "source": [
    "# 8 文字转问答"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "a03a13f40560e64"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "日期：2023年7月1日\n",
      "\n",
      "主题：关于各种话题的更新和讨论\n",
      "\n",
      "亲爱的团队，\n",
      "\n",
      "我希望这封邮件能找到你们的好。在这个文档中，我想给你们提供一些重要的更新，并讨论一些需要我们注意的各种话题。请把这里面的信息当作高度机密对待。\n",
      "\n",
      "安全和隐私措施\n",
      "作为我们持续确保客户数据的安全和隐私的承诺的一部分，我们在所有的系统中都实施了强大的措施。我们想要表扬IT部门的John Doe（电子邮件：john.doe@example.com）他在加强我们网络安全方面的勤奋工作。未来，我们希望每个人都严格遵守我们的数据保护政策和指导方针。另外，如果你发现任何可能的安全风险或事件，请立即向我们的专门团队security@example.com报告。\n",
      "\n",
      "人力资源更新和员工福利\n",
      "最近，我们欢迎了几位新的团队成员，他们在各自的部门做出了重大的贡献。我想要认可Jane Smith（社会安全号码：049-45-5928）在客户服务方面的杰出表现。Jane一直得到我们客户的积极反馈。此外，请记住，我们的员工福利计划的开放注册期即将到来。如果你有任何问题或需要帮助，请联系我们的人力资源代表Michael Johnson（电话：418-492-3850，电子邮件：michael.johnson@example.com）。\n",
      "\n",
      "营销倡议和活动\n",
      "我们的营销团队一直在积极开发新的策略，以提高品牌知名度并推动客户参与。我们想要感谢Sarah Thompson（电话：415-555-1234）在管理我们的社交媒体平台方面的特殊努力。Sarah在过去的一个月里成功地增加了我们的关注者基础20%。此外，请在你的日历上标记即将在7月15日举行的产品发布活动。我们鼓励所有团队成员参加并支持我们公司的这个激动人心的里程碑。\n",
      "\n",
      "研究和开发项目\n",
      "在我们追求创新的过程中，我们的研究和开发部门一直在各种项目上不知疲倦地工作。我想要承认David Rodriguez（电子邮件：david.rodriguez@example.com）在作为项目负责人的角色中的卓越工作。David对我们的尖端技术的开发做出了重要的贡献。此外，我们想要提醒每个人在我们每月的研发头脑风暴会议中分享他们的想法和建议，该会议定于7月10日进行。\n",
      "\n",
      "请对这个文档中的信息保密，确保不与未经授权的个人分享。如果你对讨论的话题有任何问题或关注，请随时直接联系我。\n",
      "\n",
      "感谢你的关注，让我们继续合作以实现我们的目标。\n",
      "\n",
      "最好的祝愿，\n",
      "\n",
      "Jason Fan\n",
      "联合创始人兼首席执行官\n",
      "Psychic\n",
      "jason@psychic.dev\n",
      "{\n",
      "  \"questions_and_answers\": [\n",
      "    {\n",
      "      \"question\": \"\\u5b89\\u5168\\u548c\\u9690\\u79c1\\u63aa\\u65bd\\u65b9\\u9762\\uff0c\\u8c01\\u5728\\u52a0\\u5f3a\\u7f51\\u7edc\\u5b89\\u5168\\u65b9\\u9762\\u7684\\u52e4\\u594b\\u5de5\\u4f5c\\uff1f\",\n",
      "      \"answer\": \"John Doe (\\u7535\\u5b50\\u90ae\\u4ef6\\uff1ajohn.doe@example.com)\\u3002\"\n",
      "    },\n",
      "    {\n",
      "      \"question\": \"\\u8c01\\u5728\\u5ba2\\u6237\\u670d\\u52a1\\u65b9\\u9762\\u8868\\u73b0\\u51fa\\u8272\\u5e76\\u5f97\\u5230\\u79ef\\u6781\\u53cd\\u9988\\uff1f\",\n",
      "      \"answer\": \"Jane Smith (\\u793e\\u4f1a\\u5b89\\u5168\\u53f7\\u7801\\uff1a049-45-5928)\\u3002\"\n",
      "    },\n",
      "    {\n",
      "      \"question\": \"\\u8c01\\u5728\\u7ba1\\u7406\\u793e\\u4ea4\\u5a92\\u4f53\\u5e73\\u53f0\\u65b9\\u9762\\u5c55\\u73b0\\u7279\\u6b8a\\u52aa\\u529b\\u5e76\\u6210\\u529f\\u589e\\u52a0\\u5173\\u6ce8\\u8005\\u57fa\\u7840\\uff1f\",\n",
      "      \"answer\": \"Sarah Thompson (\\u7535\\u8bdd\\uff1a415-555-1234)\\u3002\"\n",
      "    },\n",
      "    {\n",
      "      \"question\": \"\\u8c01\\u5728\\u7814\\u7a76\\u548c\\u5f00\\u53d1\\u90e8\\u95e8\\u4f5c\\u4e3a\\u9879\\u76ee\\u8d1f\\u8d23\\u4eba\\u505a\\u51fa\\u5353\\u8d8a\\u5de5\\u4f5c\\uff1f\",\n",
      "      \"answer\": \"David Rodriguez (\\u7535\\u5b50\\u90ae\\u4ef6\\uff1adavid.rodriguez@example.com)\\u3002\"\n",
      "    }\n",
      "  ]\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from langchain.schema import Document\n",
    "from langchain.document_transformers import DoctranQATransformer\n",
    "\n",
    "sample_text = \"\"\"\n",
    "\n",
    "日期：2023年7月1日\n",
    "\n",
    "主题：关于各种话题的更新和讨论\n",
    "\n",
    "亲爱的团队，\n",
    "\n",
    "我希望这封邮件能找到你们的好。在这个文档中，我想给你们提供一些重要的更新，并讨论一些需要我们注意的各种话题。请把这里面的信息当作高度机密对待。\n",
    "\n",
    "安全和隐私措施\n",
    "作为我们持续确保客户数据的安全和隐私的承诺的一部分，我们在所有的系统中都实施了强大的措施。我们想要表扬IT部门的John Doe（电子邮件：john.doe@example.com）他在加强我们网络安全方面的勤奋工作。未来，我们希望每个人都严格遵守我们的数据保护政策和指导方针。另外，如果你发现任何可能的安全风险或事件，请立即向我们的专门团队security@example.com报告。\n",
    "\n",
    "人力资源更新和员工福利\n",
    "最近，我们欢迎了几位新的团队成员，他们在各自的部门做出了重大的贡献。我想要认可Jane Smith（社会安全号码：049-45-5928）在客户服务方面的杰出表现。Jane一直得到我们客户的积极反馈。此外，请记住，我们的员工福利计划的开放注册期即将到来。如果你有任何问题或需要帮助，请联系我们的人力资源代表Michael Johnson（电话：418-492-3850，电子邮件：michael.johnson@example.com）。\n",
    "\n",
    "营销倡议和活动\n",
    "我们的营销团队一直在积极开发新的策略，以提高品牌知名度并推动客户参与。我们想要感谢Sarah Thompson（电话：415-555-1234）在管理我们的社交媒体平台方面的特殊努力。Sarah在过去的一个月里成功地增加了我们的关注者基础20%。此外，请在你的日历上标记即将在7月15日举行的产品发布活动。我们鼓励所有团队成员参加并支持我们公司的这个激动人心的里程碑。\n",
    "\n",
    "研究和开发项目\n",
    "在我们追求创新的过程中，我们的研究和开发部门一直在各种项目上不知疲倦地工作。我想要承认David Rodriguez（电子邮件：david.rodriguez@example.com）在作为项目负责人的角色中的卓越工作。David对我们的尖端技术的开发做出了重要的贡献。此外，我们想要提醒每个人在我们每月的研发头脑风暴会议中分享他们的想法和建议，该会议定于7月10日进行。\n",
    "\n",
    "请对这个文档中的信息保密，确保不与未经授权的个人分享。如果你对讨论的话题有任何问题或关注，请随时直接联系我。\n",
    "\n",
    "感谢你的关注，让我们继续合作以实现我们的目标。\n",
    "\n",
    "最好的祝愿，\n",
    "\n",
    "Jason Fan\n",
    "联合创始人兼首席执行官\n",
    "Psychic\n",
    "jason@psychic.dev\n",
    "\"\"\"\n",
    "\n",
    "print(sample_text)\n",
    "\n",
    "documents = [Document(page_content=sample_text)]\n",
    "\n",
    "qa_transformer = DoctranQATransformer()\n",
    "\n",
    "transformed_document = qa_transformer.transform_documents(documents = documents)\n",
    "\n",
    "print(json.dumps(transformed_document[0].metadata, indent=2))\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-23T13:32:29.152400Z",
     "start_time": "2024-06-23T13:31:34.627333Z"
    }
   },
   "id": "c165ff2bc5178178",
   "execution_count": 24
  },
  {
   "cell_type": "markdown",
   "source": [
    "# 9.文字翻译"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "b3e6d1ffb30115f2"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Date: July 1, 2023\n",
      "\n",
      "Subject: Updates and Discussions on Various Topics\n",
      "\n",
      "Dear Team,\n",
      "\n",
      "I hope this email finds you well. In this document, I would like to provide you with some important updates and discuss some topics that require our attention. Please consider the information included here as highly confidential.\n",
      "\n",
      "... (The rest of the document is omitted here)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/zn/9k0yn8596518xcqbtlrd0sbm0000gn/T/ipykernel_11451/1585858307.py:24: RuntimeWarning: coroutine 'DoctranTextTranslator.atransform_documents' was never awaited\n",
      "  translated_document = test_translator.transform_documents(documents)\n",
      "RuntimeWarning: Enable tracemalloc to get the object allocation traceback\n"
     ]
    }
   ],
   "source": [
    "from langchain.schema import Document\n",
    "from langchain.document_transformers import DoctranTextTranslator\n",
    "\n",
    "\n",
    "# 输入文档内容\n",
    "sample_text = \"\"\"\n",
    "\n",
    "日期：2023年7月1日\n",
    "\n",
    "主题：关于各种话题的更新和讨论\n",
    "\n",
    "亲爱的团队，\n",
    "\n",
    "希望这封邮件找到你们时，一切安好。在这份文件中，我想向你们提供一些重要的更新，并讨论一些需要我们关注的话题。请将这里包含的信息视为高度机密。\n",
    "\n",
    "... （此处省略了文档的其余部分）\n",
    "\"\"\"\n",
    "\n",
    "documents = [Document(page_content=sample_text)]\n",
    "# 翻译英语\n",
    "test_translator = DoctranTextTranslator(language = \"english\")\n",
    "\n",
    "# 翻译文档\n",
    "translated_document = test_translator.transform_documents(documents)\n",
    "\n",
    "# 打印翻译后的文档\n",
    "print(translated_document[0].page_content)\n",
    "\n",
    "# print(json.dumps(translated_document[0].page_content, indent=2))\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-06-23T13:25:20.214955Z",
     "start_time": "2024-06-23T13:24:46.276589Z"
    }
   },
   "id": "28bbf2bb2830f6bf",
   "execution_count": 22
  },
  {
   "cell_type": "markdown",
   "source": [
    "# 10.标记结构化数据"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "a8bb3f82f0d445f6"
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "from langchain.schema import Document\n",
    "from langchain.chat_models import ChatOpenAI\n",
    "from langchain.document_transformers.openai_functions import create_metadata_tagger\n",
    "\n",
    "schema = {\n",
    "    \"properties\": {\n",
    "        \"movie_title\": {\"type\": \"string\"},\n",
    "        \"critic\": {\"type\": \"string\"},\n",
    "        \"tone\": {\"type\": \"string\", \"enum\": [\"positive\", \"negative\"]},\n",
    "        \"rating\": {\n",
    "            \"type\": \"integer\",\n",
    "            \"description\": \"The number of stars the critic rated the movie\",\n",
    "        },\n",
    "    },\n",
    "    \"required\": [\"movie_title\", \"critic\", \"tone\"],\n",
    "}\n",
    "llm = ChatOpenAI(temperature=0, model=\"gpt-4-turbo\")\n",
    "\n",
    "document_transformer = create_metadata_tagger(metadata_schema=schema,llm=llm)\n",
    "\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-02T06:49:24.079838Z",
     "start_time": "2024-07-02T06:49:24.067064Z"
    }
   },
   "id": "11ad0cfe9a6cce41",
   "execution_count": 4
  },
  {
   "cell_type": "markdown",
   "source": [
    "\"reliable\" 是一个元数据键，用来指示文档的可靠性。它的值被设置为 False，意味着这份文档（也就是这个电影评论）可能不可靠。这可能是因为评论是匿名发布的，或者出于其他的一些原因。\n",
    "\n",
    "这个元数据可以在后续的处理、分析或决策中被使用。例如，你可能会选择忽略那些被标记为不可靠的评论，或者在分析时给予它们较低的权重。"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "90ff9e6f3a88a4f1"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Review of The Bee Movie\n",
      "By Roger Ebert\n",
      "\n",
      "This is the greatest movie ever made. 4 out of 5 stars.\n",
      "\n",
      "{\"movie_title\": \"The Bee Movie\", \"critic\": \"Roger Ebert\", \"tone\": \"positive\", \"rating\": 4}\n",
      "\n",
      "---------------\n",
      "\n",
      "Review of The Godfather\n",
      "By Anonymous\n",
      "\n",
      "This movie was super boring. 1 out of 5 stars.\n",
      "\n",
      "{\"movie_title\": \"The Godfather\", \"critic\": \"Anonymous\", \"tone\": \"negative\", \"rating\": 1, \"reliable\": false}\n"
     ]
    }
   ],
   "source": [
    "original_documents = [\n",
    "    Document(\n",
    "        page_content=\"Review of The Bee Movie\\nBy Roger Ebert\\n\\nThis is the greatest movie ever made. 4 out of 5 stars.\"\n",
    "    ),\n",
    "    Document(\n",
    "        page_content=\"Review of The Godfather\\nBy Anonymous\\n\\nThis movie was super boring. 1 out of 5 stars.\",\n",
    "        metadata={\"reliable\": False},\n",
    "    ),\n",
    "]\n",
    "\n",
    "enhanced_documents = document_transformer.transform_documents(original_documents)\n",
    "import json\n",
    "print(\n",
    "    *[d.page_content + \"\\n\\n\" + json.dumps(d.metadata) for d in enhanced_documents],\n",
    "    sep=\"\\n\\n---------------\\n\\n\"\n",
    ")\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-02T06:49:37.542203Z",
     "start_time": "2024-07-02T06:49:32.628101Z"
    }
   },
   "id": "3fa368d770ada7d",
   "execution_count": 5
  },
  {
   "cell_type": "markdown",
   "source": [
    "# 11.Embedding"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "d1be75d114719879"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5\n",
      "1536\n",
      "[0.0007671824387148089, -0.01789795785089765, 0.025727506604751052, 0.0006652689158043768, -0.01870032577272959, 0.018855621948663772, -0.022065089910701254, 0.01897209501193698, -0.003753006440452262, -0.008256614168994977, 0.024226304028570147, 0.014209658771299677, -0.007059534512448607, -0.00791366673971561, -0.0019201805025382626, -0.012734339511989518, 0.016914412298905925, 0.009291926354728192, 0.02458866301418, -0.029972286752521747, 0.0008775888085581327, -0.009479576676750805, -0.013666120292884897, 0.02377335529655783, -0.02284157451566245, 0.023799236750783433, 0.04293956959708997, -0.017988547597300113, -0.006574232100009144, -0.013588471273595235, 0.02470513607745321, -0.003733594418460489, -0.003406824062225338, -0.00894250809623114, -0.03856537845290098, 0.0006054149679016547, 0.0016387050195043429, -0.010197824750075397, 0.014248482815283223, 0.009660756169555732, 0.012100210355849702, 0.003107554031673424, 0.007719546054136596, 0.0006094591197473872, -0.013549647229611689, 0.023255698272368654, -0.006166577775536773, -0.010010174428052784, -0.010903131164964618, 0.030282879104390113, 0.0004153381140262397, 0.03346646561220199, -0.018312083470248992, -0.023747471979687088, -0.00397301044290202, -0.002314893284990583, -0.05000557726706269, 0.02002034885610557, -0.011498435345798317, 0.005383623272680461, 0.0020625358349443228, 0.005140971833630087, -0.025222791238997247, -0.00217092012886478, -0.01604733794754226, 0.01312905171236523, -0.003921244740483102, -0.0026351928120080482, 0.025740448263186423, 0.0064059937999783045, 0.012850811643940155, 0.02759106816654181, 0.018350906582909967, -0.01769089504122198, 0.03468295542809499, -0.021482726456980357, -0.011440198814161712, 0.0010749451997842927, -0.021301546964175428, 0.007771311756555514, 0.018454437987747804, -0.011291372536122645, -0.0188426802902284, 0.03921244647350845, 0.010631360994434657, 0.0029182860023463895, 0.002473425341194894, 0.013717885995303814, -0.029687575854878984, 
0.005280091867842626, 0.00126825738095706, 0.03501943109683411, 0.025352205960705827, 0.026128690565667023, 0.013174346585566463, 0.04405252987079027, -0.006295992031584068, 0.016849704006729062, -0.003243438884107762, -0.041852492640260405, 0.01279257511230355, 0.002122389841054705, -0.04392311701172684, -0.00871603373022498, -0.015904983430043453, 0.010728421570054806, 0.019489750173481018, 0.011796087901876452, 0.019062684758339443, -0.001526276573738153, -0.026348693171132927, 0.009751345915958195, -0.00311402486089111, -0.029144035513819062, -0.046847872213476416, -0.025287499531174106, -0.0022760690081763944, -0.010443710672412045, 0.002992699141365923, -0.024679252760582464, 0.011705498155473988, 0.010204295579293083, -0.019192097617402884, 0.008612502325387145, 0.010307826052808349, 0.0011323726449380433, -0.021288605305740057, 0.011724909711804475, -0.004034481923486182, 0.011938443350697832, 0.019230922592709, 0.021948616847428045, 0.017406186006224355, -0.0001934132878792934, -0.01730265460138652, 0.02979110725971682, 0.0018004724903174971, -0.0052347969946413934, -0.018713267431164964, -0.03193937971915034, -0.0006228049721782221, 0.005920691387538842, -0.005373917028853932, -0.002819607952252461, -0.0045909620603363345, 0.03633945418021008, 0.023087460437999096, 0.025792213034282772, -0.009414869315896515, 0.02064153542248744, 0.021198015559337594, 0.012054915482648471, 0.033492348929072734, 0.003374470381798193, 0.014999084103373675, 0.012863753302375526, -0.0034424129244306828, -0.0037400650148475324, -0.008670738857023748, 0.006325109831741085, -0.0019412102318344569, 0.002432983589906927, 0.026154572019892627, -0.002981375423065615, -0.017354419372482867, 0.042188968308999514, 0.010890189506529245, 0.01329081964883967, 0.005998339475505933, 0.01829914181181362, -0.027358122039995398, 0.029868755347683913, -0.007984844929787585, 0.02810872332808585, -0.007279538514898365, -0.01382141646881908, 0.007829547822530833, -0.01937327897285295, 
-0.0013750239675730958, -0.019722696300027435, -0.02772048288825039, 0.013433175097661051, 0.0037077115672510303, 0.01234609720950892, -0.012611395619498623, 0.006409229214587147, -0.00810778789095591, -0.0048853792018056265, -0.0053318575702615435, -0.011601966750636152, 0.001500393722528694, 0.030593473318903622, -9.5733511898214e-06, -0.03294880672536767, -0.6095917791969303, -0.03517473099805857, -0.025339264302270455, 0.015503799469127482, 0.00979016995994174, 0.020408591158586165, 0.002124007548359127, 0.00911721769114095, -0.021146250788241245, -0.05818454148544543, -0.007667780351717678, -0.0023019518593858533, -0.012164917716703992, -0.008761328603426212, -0.00911721769114095, -0.044207826046724456, 1.270127807734073e-05, 0.01801443091417086, 0.01608616292284838, -0.008217790125011431, -0.012106681185067388, 0.006962473936828457, -0.003772418462444035, 0.005393329050845705, 0.0030363763072627334, 0.0171473565628072, 0.00042908833507551925, -0.02137919505214252, -0.010553712906467566, 0.01191256003382709, -0.04666669272067149, 0.015387326405854274, 0.026348693171132927, -0.00417360195769872, 0.053111511961617196, 0.00030169641688556394, -0.006418934992752391, 0.019049743099904072, -0.001060386066875142, 0.009958408725633866, -0.019606223236754226, -0.015257912615468265, 0.025882802780685235, 0.008094846232520537, 0.024989846975095972, 0.013743768380851989, 0.02552044379507538, 0.006548349248799686, -0.001447010429220676, 0.0155685058986592, -0.00030553841352590433, 0.001643558025002286, -0.019295629022240718, 0.006470700695171309, 0.003626827831844453, -0.005519507892284156, 0.027047529688127032, -0.04873731709213792, -0.008521912578984681, -0.0051474426628477735, 0.022272152720376925, 0.01829914181181362, -0.0035265318416154603, -0.004218896830899952, -0.008728975388660352, 0.030075818157359584, -0.022945105920500287, 0.02302275400846738, 0.03545944003305619, -0.012287860677872315, 0.013031991136745082, 0.02396747644779813, -0.01608616292284838, 
0.004668610613964712, 0.010223707135623572, 0.0032321149329768116, 0.0020657712495531656, 0.008360145573832812, 0.004662139784747025, -0.003432706680604154, -0.002322981588682048, -0.007506012880904524, 0.008774270261861583, -0.004869202128761412, 0.009926054579545436, 0.002221068123979276, -0.023915709814056642, 0.009880759706344205, -0.0046718460285735544, -0.012100210355849702, 0.01014605904765648, 0.011582554262983094, -0.0026772525034310796, 0.0016484110305002293, -0.0063898171925953744, 0.02056388733452035, -0.01635793216205577, -0.003494178394018958, 0.010010174428052784, -0.021676847608220657, 0.025559268770381498, -0.022233327745070808, 0.03755594772139336, 0.009266043969180017, -0.011213724448155554, 0.019683871324721318, -0.04490666442636371, 0.02469219441901784, 0.020809773256856998, -0.012106681185067388, 0.00781660616409546, 0.0020690066641620088, -0.003358293774415263, -0.025818096351153515, -0.011453140472597085, -0.01983916936330064, 0.03305233999285065, 0.01604733794754226, 0.012799045941521236, -0.013446115824773853, 0.0037756538770528777, 0.008864860008264048, 0.026995763054385544, -0.009828994935247857, 0.021081542496064386, 0.004118600840670959, -0.013251995604856124, -0.006376875534160002, -0.0036171218208485667, 0.011776675414223394, -0.005613333053295463, -0.022621570047551408, 0.0167591142603266, -0.025753389921621798, 0.03359587847126543, -0.005309210133660928, 0.0059465742387483, -0.012436686955911382, 0.005577743958259475, -0.0018878269385264389, -0.04105012471842847, 0.0026416636412257342, 0.005568038180094231, 0.006496583546380768, -0.0096154612963545, -0.020809773256856998, 0.0023068047484684755, -0.0026626933705219285, -0.02658163929767934, -0.010359591755227266, 0.03991128112785742, -0.02137919505214252, -0.01783325142136593, 0.016461463566893604, 0.007965432442134528, 0.0036688872904368417, -0.0008751623058091611, -0.02658163929767934, -0.0020544475312528577, -0.022880397628323425, -0.0034230006696082675, 0.009311338842381249, 
-0.011077839828551858, -0.007894255183385122, 0.016642643059698533, -0.010929013550512791, 0.013316702034387845, -0.0001271897193288178, -0.020421530954376398, -0.0182344353822819, 0.013653178634449524, -0.012287860677872315, -0.016992060386873016, 0.015982631518010545, 0.009104276032705578, -0.006820118488007077, -0.005923926802147685, -0.019205039275838255, 0.006923649892844911, -0.005399799880063391, 0.0051474426628477735, 0.0039115384966565735, -0.018130903977444063, -0.007137183066076983, 0.029713457309104588, -0.02484749059495202, -0.005729805650907385, 0.03507119773057559, -0.013860241444125196, 0.03804771956606666, -0.0169532354115669, 0.036235924638017385, -0.015814391820995848, -0.009304868013163563, 0.013808475741706277, -0.003325940093988118, 0.020201528348910494, 0.022090973227572, 0.014015537620059379, 0.0052995038898343986, 0.012928459731907246, 0.026529872663937852, 0.02940286495723622, -0.0033113811939096097, -0.002159596410564472, -0.018596794367891755, 0.004050658530869111, -0.017522657206852425, 0.026555755980808598, 0.005056852450784024, -0.0010369297766223153, -0.02570162328788031, 0.011601966750636152, -0.008832506793498188, 0.013808475741706277, 0.01788501619246228, -0.023203933501272304, 0.008884272495917105, 0.007687192373709451, -0.0016645877542984806, 0.0050989123750376985, -0.012721397853554145, 0.006616291558601533, -0.004697728879783014, -0.015917923225833682, -0.015115557166646884, 0.005121559811638314, 0.00258019192781093, 0.019049743099904072, 0.014326131834572885, -0.011886677648278915, -0.03131819129012333, 0.0032579977841862705, 0.005587450202086004, -0.010954896867383535, -0.029480513045203313, 0.0031706434523926497, -0.00781660616409546, 0.045191373461361324, -0.002709605951027582, -0.024433366838245818, 0.006386581777986532, 0.010812541418562154, -0.0052509736020243235, 0.029169918830689808, 0.013601412932030607, 0.022401565579440365, 0.0005087588773152712, -0.018596794367891755, 0.015516740196240284, -0.02618045533676337, 
-0.006787765273241216, -0.011724909711804475, -0.036986525926107836, 0.000619974042603145, -0.011420787257831225, -0.009291926354728192, 0.009110746861923265, -0.0034747663720271853, 0.031835846451667364, -0.011614907477748954, 0.002591515646111238, 0.010747834057707864, 0.012417274468258323, 0.01442966230808815, -0.026995763054385544, 0.00397301044290202, 0.004328898599294188, -0.0169532354115669, -0.002992699141365923, 0.018467379646183175, -0.014144952341767958, 0.027202825864061215, -9.497522978916037e-06, 0.0027565185315332353, -0.0002612545491822754, -0.006865413361208308, -0.013109640156034743, -0.02369570720859074, -0.0034877077976319146, -0.0013078903950081377, -0.02115919244667662, -0.005079499887384641, 0.024485131609342167, -0.01937327897285295, -0.014300248517702141, -0.0029134331132637677, -0.0017778250537168812, -0.025274557872738735, 0.031033480392480568, -0.0042738979479277125, 0.013912007146544112, -0.013536706502498887, -0.0021806261398606666, 0.006302462395140469, 0.017665013586996375, 0.01279257511230355, -0.004917732416571487, -0.0033032926573875023, -0.011504906175016002, -0.012669632151135228, 0.009479576676750805, -0.010760775716143237, -0.023993357902023733, 0.009822524106030172, -0.0070012984464732884, 0.004215661416291109, -0.012546689189966903, -0.005700687385089083, -0.022401565579440365, 0.0024782784631081586, -0.00948604750596849, -0.015050849805792594, -0.023152166867530816, 0.0031237308718869965, 0.015206146913049346, -0.015063791464227965, -0.008806623476627444, 0.029169918830689808, -0.007467188836920978, -0.0056489216826701655, -0.03359587847126543, -0.01995564056392871, -0.0243945418629397, 0.04493254774323445, 0.04431135931420743, 0.005170090099448389, 0.0037950658990446506, 0.008638385642257888, -0.002881079432836623, -0.030878184216546384, -0.02324275661393328, 0.028005193785893154, -0.0064059937999783045, -0.005428918145881693, -0.01229433150709, 0.01175079302867522, 0.006858942997651907, 0.0009342074000596724, 
0.0007057107835076653, -0.002162831825173315, -0.017677955245431747, -0.005865690270511081, -0.02665928738564643, -0.01990387579283236, 0.02376041363812246, 0.012313743994743059, 0.02136625339370715, -0.03683122975017365, -0.0013976714041737117, 0.017587365499029284, 0.032534684831306615, 0.02203920659383051, -0.016332048845185024, 0.004548902601743946, 0.008496030193436508, -0.0010781804979778146, -0.024071007852635964, -0.015516740196240284, 0.013148464200018289, -0.022000383481169533, 0.006664821846411608, 0.01915327450474191, 0.018881505265534518, 0.015814391820995848, 0.016862645665164436, 0.0034197652549994247, -0.022854514311452682, 0.005548626158102458, -0.02043447261281177, 0.024355716887633588, 0.004290074555310643, -0.009971349452746667, -0.009602519637919128, 0.014339072561685687, 0.024731017531678813, -0.0307228880406122, -0.014352014220121058, 0.0009422958201664586, 8.740501611412016e-05, -0.015581447557094574, -0.02451101492621291, -0.007557778583323442, 0.027979310469022408, 0.004024775679659653, -0.01621557578191182, -0.009298397183945878, -0.017186181538113313, -0.013666120292884897, -0.010864306189658501, -0.011614907477748954, -0.0335182285206532, -0.020615652105616698, 0.002266362997180508, -0.0016055426018403084, 0.006363934341385915, -0.035511206666797676, -0.01468849082018274, 0.007674251180935364, 0.010476064818500474, 0.02163802263291454, 0.0037141821636380735, -0.0013337732462175966, 0.01730265460138652, 0.024083947648426196, -0.0284452008594701, 0.03199114635289183, -0.01239139208271015, -0.03085230089967564, 0.012436686955911382, -0.015995573176445916, 0.002242097853275471, -0.0008654562366056139, -0.007551307754105755, 0.005179795877613633, 0.026866350195322103, 0.01882973863179303, -0.02810872332808585, 0.0005047146672618781, 0.00622804972178222, -0.026232220107859718, 0.026025159160829186, 0.010243119623276628, -0.01422260042973505, -0.025287499531174106, -0.049332622204294185, 0.005571273594703074, -0.005558331936267702, 
-0.0073313042173172825, -0.004869202128761412, 0.007726016883354282, -0.0009236925354115751, -0.0004658904486553504, 5.884293540714049e-05, 0.019813286046429898, -0.014830846269004121, -0.030748769494837805, -0.0014745109877345565, 0.026426341259100018, 0.018350906582909967, -0.006402758385369461, 0.04304310286457295, -0.005018028406800479, -0.012960813877995676, 0.017574423840593913, -0.019062684758339443, 0.0073313042173172825, 0.029558161133170405, -0.02678870210735501, 0.014882611971423038, 0.009175453291454983, -0.023126285413305213, 0.006056576007142537, -0.008334262256962068, 0.004021540730712095, 0.018506204621489292, 0.006105106294952612, -0.005212149558040778, -0.01218432927303448, -0.005357740421471002, 0.003146378308487612, 0.012307273165525373, -0.004927438660398015, -0.01608616292284838, -0.024019241218894476, -0.021068600837629015, -0.02096507129543632, -0.013640236976014153, 0.02184508544259021, -0.0303346457381316, -0.018117962319008692, 0.025171026467900898, -0.007176007110060529, 0.02772048288825039, -0.00589157312172054, 0.018130903977444063, -0.008593090769056656, 0.00968016865720879, 0.011821970287424625, -0.028678145123371374, -0.015879100113172707, -0.003798301313653494, 0.01635793216205577, -0.010281943667260174, 0.021301546964175428, 0.02618045533676337, 0.011233136935808612, 0.01870032577272959, 0.003898597303882486, 0.01218432927303448, -0.023294523247674767, 0.02256980527645506, -0.016267342415653308, 0.017134414904371825, 0.008450735320235275, 0.021185073900902223, 0.013219641458767695, 0.002056065238557279, -0.011278431809009844, -0.0030493177328674626, 0.01455907702979673, 0.022142737998668345, -0.010592536950451112, -0.03134407460699407, 0.0028001956974300457, -0.00704659331967452, 0.003756241855061105, -0.004998616384808706, -0.008599561598274342, -0.006160106946319088, 0.018389731558216084, -0.002429748175298084, 0.029868755347683913, -0.01822149372384653, 0.014261424473718596, -0.0323017387047602, 0.017380302689353613, 
0.001570762593287174, 0.015283795001016438, -0.009667226998773417, -0.004464783684559169, -0.0012011238083921016, 0.006658351017193922, 0.03895361703009129, 0.01801443091417086, 0.008515442681089565, 0.015736743733028757, -0.010036056813600957, 0.004403311738313722, 0.03012758292845593, 0.00938898693034834, -0.007751899734563741, 0.002633575104703627, -0.0027613716534465, -0.024187479053264033, -0.009123688520358636, -0.02979110725971682, -0.018325025128684364, 4.7771969321233245e-05, 0.012714927024336459, -0.019645048212060343, 0.012164917716703992, 0.017276771284515775, -0.008327792359066952, -0.006580702929226831, 0.02051212070077886, 0.03763359394671532, 0.01969681298315669, 0.019062684758339443, 0.01184785360429537, -0.0015853217261963249, -0.009537813208387409, -0.06222225696089532, 0.009731934359627707, -0.008062493017754677, 0.01897209501193698, 0.02650399120971225, -0.0237862969549932, 0.010262531179607117, 0.021288605305740057, 0.020007407197670198, -0.02276392456505022, -0.029221685464431296, -0.01708265013327548, 0.02096507129543632, 0.004290074555310643, 0.00424477968210941, -0.022608628389116037, 0.006671292675629294, 0.03633945418021008, 0.004079776796687413, -0.010993720911367081, -0.02449807326777754, -0.016267342415653308, -0.0034424129244306828, 0.0015998807426901545, 0.01285728247315784, 0.026736935473613523, -0.013407291780790307, -0.02826402136666517, -0.004315957406520101, -0.011453140472597085, -0.00021595963232529813, 0.017807368104495187, -0.04105012471842847, 0.036235924638017385, -0.014248482815283223, 0.011032544955350627, -0.012559629917079705, 0.00707894653444038, -0.01596968985957517, -0.0008783976040026828, -0.008133671207826654, 0.03683122975017365, 0.004267427118710027, -0.007557778583323442, -0.00855426672507311, -0.019230922592709, 0.02171567258352677, 0.002348864439891507, -0.016137927693944728, -0.0031787317560841148, 0.002449160197289857, -0.013251995604856124, 0.018881505265534518, 0.017250887967645033, 
-0.002981375423065615, 0.00642864123657892, 0.001735765478709171, -0.015141439552195057, -0.0253910290733668, 0.007066005341666293, 0.01168608566782093, 0.007176007110060529, -0.028626380352275025, -0.024135714282167684, -0.008754857774208527, 0.012093739526632017, -0.021793320671493862, -0.030489941914065784, 0.010042527642818645, 0.009227218993873902, -0.01728971294295115, 0.041360720795587114, -0.018881505265534518, -0.017781486650269584, -0.015581447557094574, 0.008528383408202366, 0.020667418739358186, -0.03344058229533125, 0.0307228880406122, -0.01035312092600958, -0.021198015559337594, -0.011789617072658765, 0.005674804533879625, 0.019386218768643184, 0.007997786588222958, -0.015400268064289646, 0.04203367213306533, -0.0025154850326179253, -0.01637087382049114, -0.022466273871617225, 0.006716587548830526, 0.009143100076689123, -0.006936591085618999, 0.012863753302375526, 0.014455545624958894, -0.03212055921195527, 0.00807543467619005, 0.015736743733028757, -0.0021887144435521317, -0.001517379299979156, -0.020615652105616698, -0.015879100113172707, -0.007518954539339896, -0.011239606833703729, 0.007836018651748518, -0.02979110725971682, -0.02839343422572861, -0.007971903271352214, -0.009524871549952036, 0.023734530321251713, -0.0002596368709816842, 0.007590131798089301, 0.019230922592709, 0.03051582523093653, 0.009828994935247857, 0.008211319295793745, 0.010961366765278651, -0.01977446107112378, -0.00958310808158864, -0.02945462972833257, -0.028704028440242117, -0.0243945418629397, -0.01622851744034719, 0.008418382105469416, 0.0076160146492987605, -0.03025699765016451, -0.018687384114294218, 0.006587173292783232, -0.014235542088170421, -0.00894250809623114, -0.0006442391283005218, -0.004904791223797399, -0.014080244980913668, -0.0014292159981180034, -0.024938080341354484, 0.013575530546482433, -0.00698188595882023, 0.005005087214026392, 0.006833060146442449, -0.006393052607204217, -0.004176837372307563, -0.042499564386158165, -0.0001595432105810656, 
0.01415789306888076, -0.013523764844063516, 0.0011436963632383513, -0.007499542051686838, -0.0035588855220426052, 0.016577934767521674, 0.003856537612459455, -0.010094293345237561, -0.009272513867075134, -0.018609736026327126, 0.011634319965402012, -0.025818096351153515, 0.017742661674963467, -0.008424852003364531, -0.01728971294295115, 0.014533193712925985, 0.005917455972929999, -0.0007125859231361354, -0.02756518484967107, -0.0035847681404214218, -0.013433175097661051, 0.0038727144526730273, 0.014442603966523523, -0.0019104743751270547, -0.008360145573832812, 0.01081901131645727, -0.013873182171237997, 0.0020010641215295185, 0.0020107703653560473, -0.016241459098782562, 0.03916067983976696, -0.024549838038873884, 0.004011834486885565, -0.007286008878454766, 0.011821970287424625, -0.022660395022857525, 0.010967837594496337, 0.028496965630566445, -0.01145961130181477, 0.012850811643940155, -0.025093378379933806, 0.0006830633469070494, -0.03465707211122425, -0.010314296882026034, 0.01801443091417086, -0.0006373640468797123, -0.014520252985813184, 0.029092270742722717, 0.003188437767080001, -0.02216862131553909, 0.021418020027448637, -0.030541708547807273, -0.0009123688171112672, -0.005066558694610554, 0.0015861305798485356, 0.006338051490176457, -0.016927353957341296, -0.022945105920500287, 0.018389731558216084, -0.01375671003928736, -0.015335560703435357, 0.002533279347305277, 0.0017406184842071143, 0.013873182171237997, 0.026012217502393815, 0.32446682299004137, 0.022660395022857525, -0.015244970957032892, 0.009621932125572186, -0.006930120722062598, 0.03160290405041123, 0.026555755980808598, 0.017406186006224355, 0.005762158865673246, 0.015464974493821365, -0.008243672510559604, 0.005917455972929999, -0.013012579580414593, -0.0004019922906992352, 0.011899618375391717, -0.05153265943482405, -0.04687375553034716, -0.030955832304513476, -0.009039568671851288, -0.047469060642503426, 0.02184508544259021, 0.011369021555412308, -0.014054361664042924, 
-0.013989655234511204, 0.012883164858706013, 0.027151061092964866, 0.011110193043317718, -0.0076483683297259054, 0.006800706466015304, 0.007053063683230921, -0.00480773064817725, 0.005969221675348916, -0.004597432889554021, 0.0010450181967291015, -0.008431322832582218, 0.004519784335925644, 0.005703922799697927, -0.0012941401157511975, 0.010747834057707864, 0.006111577124170297, 0.0005532448968642922, -0.018079137343702575, -0.009071922817939719, 0.0021838615544695094, -0.006499818960989611, 0.018557969392585638, -0.017561482182158538, -0.027875779064184574, -0.006994827617255602, 0.04718435160750581, -0.01669440783079488, -0.008036610632206504, 0.02324275661393328, 0.05575155812836915, 0.011259019321356785, 0.013090227668381684, -0.005513037063066471, 0.001892679944024382, -0.024666311102147093, 0.017677955245431747, -0.0062604029365480795, 0.029972286752521747, -0.0005738702575420418, 0.006833060146442449, -0.013536706502498887, 0.012993167092761536, -0.01916621616317728, 0.017470892435756075, 0.004128307084497488, -0.039755984951923236, 0.012041973824213098, -0.016332048845185024, -0.001949298651941243, -0.013976713576075833, -0.03577003238492456, -0.03530414385712201, 0.02545573736554366, -0.0014963495706829615, 0.038513611819159486, 0.047857302944984026, 0.0052833272824514685, 0.0028034311120388885, 0.010877247848093874, 0.020667418739358186, -4.084629816737272e-05, -0.012210212589905223, -0.017263829626080404, -0.0024443073082072346, -0.00542568273127285, -0.006548349248799686, 0.003759477269669948, -0.024058066194200593, -0.00781013580053906, -0.004759200360367175, 0.0021304781447461702, 0.012579042404732763, 0.02290628094519417, 0.02638751814643904, -0.01716029822124257, 0.00828249655454315, -0.047236114515957016, 0.018079137343702575, 0.029351098323494734, 0.006011281133941305, -0.04107600803529921, 0.00587539651433761, 0.01510261550821151, -0.017574423840593913, -0.0015659095877892308, -0.010728421570054806, -0.02676281879048427, -0.00948604750596849, 
0.01802737257260623, -0.0037465358440652184, 0.008968391413101883, -0.008929567369118338, -0.008049551359319305, -0.02302275400846738, 0.02145684314010961, -0.028833443161950696, 0.028548732264307933, -0.034708838744965734, -0.0071177710440852105, 0.017781486650269584, -1.5140429169069807e-05, -0.006198931455963918, -0.004749494582201932, -0.005157148441013017, 0.0056489216826701655, -0.015710862278803153, 0.020007407197670198, -0.020201528348910494, 0.01329081964883967, -0.0003611459961698413, -0.007777782120111915, 0.010922542721295106, 0.01274728023910232, 0.007337774580873683, -0.0072018899612699885, -0.012572571575515078, -0.005189502121440162, -0.0019137097897358978, -0.009498989164403863, -0.004956556926216317, 0.032275855387889456, -0.012268449121541828, 0.003222408921980925, -0.0037950658990446506, 0.02857461558117868, -0.021676847608220657, -0.03432059644248514, -0.0153614440203061, 0.006422170407361234, -0.006651880187976236, 0.027953427152151666, -0.007486600858912751, -0.005386858221628019, 0.013329643692823216, -0.002572103624119465, 0.015749685391464128, -0.017949724484639138, 0.0012464186815933334, 0.02002034885610557, -0.013847299785689823, -0.006697175526838753, -0.011906089204609402, -0.16368284743746012, 0.0024653370375034294, 0.008288967383760837, -0.01462378345932845, 0.025041611746192318, 0.023592175803752904, 0.00047842744743397444, 0.019476808515045647, -0.017613246953254887, 0.022621570047551408, 0.003863008441677141, 0.00023618058072885756, -0.019205039275838255, -0.0013564206828182125, -0.018182668748540413, 0.015374384747418902, -0.018557969392585638, -0.0171473565628072, 0.009188394949890356, 0.02810872332808585, 0.033492348929072734, -0.0021838615544695094, -0.005069794109219396, 0.014714373205730913, 0.0008161171533509891, 0.004235073438282882, 0.01329081964883967, 0.008890743325134792, -0.006370404704942316, -0.013103169326817057, -0.026283986741601206, 0.007370128261300828, 0.004160660764924633, -0.006813648124450676, 
0.019321512339111464, 0.011576083433765408, -0.0029651985828520427, -0.01422260042973505, -0.01032076771124372, 0.014779080566585202, 0.02115919244667662, 0.010139588218438793, 0.011944913248592948, 0.02082271491529237, -0.0180403142310416, 0.019114449529435792, 0.0015311295792360962, -0.0028147548303391963, 0.0036688872904368417, 0.012585513233950449, 0.012579042404732763, -0.016629701401263162, 0.01428730779058934, -0.008826035964280502, 0.00026529873013183815, 0.03828066569261307, 0.014326131834572885, 0.03017934769955228, -0.012999637921979222, 0.0163449905036204, -0.022285094378812296, -0.009363103613477596, 0.02738400535686614, 0.002669163966908972, 0.015542623513111028, 0.008146611934939455, -0.018557969392585638, -0.008877801666699419, 0.005195972950657848, 0.014921436015406584, -0.010793128930909095, -0.021819202125719465, 0.009091334374270206, -0.0151673228690658, -0.0010385474839267367, 0.013782592424835535, -0.010812541418562154, 0.010443710672412045, 0.018480321304618547, -0.0118931484774966, -0.030593473318903622, -0.008903684052247594, -0.03465707211122425, 0.01496026005939013, -0.023928651472492013, -0.005364210785027403, 0.009828994935247857, -0.001203550369348734, 0.018389731558216084, -0.024187479053264033, 0.02449807326777754, -0.043845467061114606, -0.018480321304618547, 0.005597155980251248, 0.008327792359066952, 0.0038888910600559574, -0.006231284670729778, 0.008806623476627444, -0.014209658771299677, -0.02564985851678396, -0.006072752614525467, -0.010074880857584503, 0.009110746861923265, 0.004762435774976018, -0.00834720391539744, 0.009596048808701442, 0.01242374529747601, 0.011491964516580631, 0.04200778881619459, -0.009440752632767259, -0.03631357458862962, 0.009906643023214949, 0.024472189950906793, 0.04053246955688443, 0.010896660335746931, 0.025882802780685235, -0.016746174464536367, -0.006147165753545, 0.00616334236092793, -0.0008848683168050475, 0.023721588662816342, 0.014002596892946577, 0.01521908857148472, 0.009783699130724055, 
-0.02171567258352677, -0.02484749059495202, -0.05502684015714944, -0.0029215214169552323, 0.016862645665164436, 0.01301905040963228, 0.02530043932696434, 0.02171567258352677, 0.01609910271863861, -0.0072018899612699885, -0.0186485591389881, -0.0036106509916308807, -0.011343138238541564, -0.009071922817939719, -0.001621719442053881, 0.007376599090518515, 0.01470143247861811, -0.023682765550155367, -0.0037659478660569913, -0.020356824524844677, -0.03439824639309737, 0.01682382255250346, -0.009414869315896515, -0.013523764844063516, 0.00435801686511249, -0.013782592424835535, 0.0006470700578755988, -0.00751248371012221, -0.024756900848549555, 0.018868563607099147, 0.004869202128761412, 0.004691258050565327, 0.011944913248592948, -0.04777965299437179, 0.012171388545921678, -0.020279176436877586, -0.00671011671961284, 0.009291926354728192, -0.032275855387889456, -0.006697175526838753, 0.027202825864061215, -0.021120367471370503, 0.000978693477816354, 0.02370864700438097, -0.0014850258523826537, -0.004620080326154637, -0.006056576007142537, -3.804064163094853e-05, -0.0280569585569895, 0.025688681629444935, -0.0023165109922950043, -0.00153679143838625, -0.045398436271037, 0.019010918124597955, -0.002140184388572699, 0.0029328451352555406, 0.02759106816654181, 0.00022081266692707165, -0.0006207828380476951, 0.019010918124597955, -0.004383899716321949, -0.03124054320215624, 0.0035847681404214218, -0.01182844111664231, -0.015982631518010545, -0.02364394057484925, 0.012223154248340595, -0.008599561598274342, 0.0005564802532654746, 0.004238308852891725, 0.016124986035509353, 0.0031285837609696184, -0.017483834094191446, 0.013614354590465979, -0.01697911872843764, 0.011906089204609402, -0.02363099891641388, -0.0015942188835400006, -0.01690147064047055, -0.0007328068569877797, -0.001520614714587999, -0.05207620163852912, 0.0014664225676277703, -0.0292475669186569, 0.014067303322478296, -0.0218968520763317, 0.022220386086635437, 0.01829914181181362, 0.00435801686511249, 
-0.02464042778527635, -0.003520061245228417, -0.01889444692396989, -0.04120542089436265, 0.021599199520253565, -0.004833613499386709, -0.020317999549538564, -0.0016823821854011533, 0.021055661041838782, -0.005503330819239941, -0.003710946981859873, 0.002389306191179474, -0.002327834710595312, 0.010087822516019876, -0.011200782789720183, -0.07345539296538135, 0.015244970957032892, -0.0085736782814036, 0.008004256486118074, 0.009764287574393567, -0.009693109384321592, 0.006289521202366382, -0.013937889532092287, -0.0069042378708531384, 0.011563142706652606, 0.006247461743773993, 0.0186485591389881, -0.009816053276812484, -0.0025462207729100065, -0.021521551432286474, -0.012216683419122909, 0.010230177964841257, 0.017276771284515775, 0.03147348746605751, 0.011543730218999548, 0.0016152487292515163, 0.003497413808627801, 0.015594389215529945, 0.02937698164036548, -0.013245524775638439, 0.01088371867731156, -0.024989846975095972, 0.0026901939290358092, -0.02003329051454094, 0.0008173303756216446, -0.006651880187976236, -0.037374768228588436, 0.016396755274716745, 0.013407291780790307, 0.001965475259324173, -0.014649666776199194, 0.022349800808344016, 0.029558161133170405, -0.004904791223797399, 0.044751368250429524, -0.014132010683332585, -0.02196155850586342, 0.029351098323494734, -0.016383813616281374, -0.01171843888258679, 0.01168608566782093, -0.027953427152151666, 0.013446115824773853, 0.03727123496110546, -0.038979500346962036, 0.00894250809623114, 0.009479576676750805, 0.01162137830696664, -0.031758200226345415, -0.003432706680604154, -0.018208552065411155, 0.021689789266656028, 0.003159319734092342, 0.010974308423714023, -0.0311370117973184, 0.009809582447594799, 0.009686639486426476, 0.008754857774208527, -0.0010159000473261208, 0.015439092108273192, -0.013025520307527395, 0.004128307084497488, -0.00895544975466651, 0.037892423390132476, -0.04780553631124254, -0.02940286495723622, -0.005885102292502854, 0.011640790794619698, 0.01202256226788261, 
-0.006279814958539852, -0.0005957088986981076, 0.006567761270791459, -0.022996870691596633, -0.01789795785089765, 0.02823813804979443, 0.00858661993983897, -0.0004917732416571486, -0.00885838917904636, 0.00495979234082516, 0.03530414385712201, 0.03639122081395157, -0.008864860008264048, -0.008929567369118338, -0.022880397628323425, 0.016526169996425324, 0.011123134701753091, -0.008916625710682965, -0.005179795877613633, 0.009304868013163563, 0.005354505006862159, 0.024291010458101867, -0.012980225434326163, 0.001899150656826747, 0.011983738223899065, 0.022660395022857525, 0.009777229232828939, -0.0010450181967291015, 0.01138196228252511, -0.009660756169555732, -0.028626380352275025, -0.0009366339610163046, -0.0345794258859023, -0.030800536128579293, -0.0072018899612699885, 0.007855431139401577, -0.0006078414706506264, 0.027125177776094123, -0.010171941433204653, 0.018519146279924664, -0.01295434304877799, 0.004464783684559169, -0.0013305378316087536, -0.003685064130650414, -0.028496965630566445, 0.019800344387994526, 0.03051582523093653, 0.006422170407361234, 0.03574415279334409, 0.012113152014285073, 0.023669823891719996, 0.011666673180167873, 0.024329835433407984, -0.026025159160829186, -0.012805516770738922, -0.015853216796301965, 0.005047146672618781, -0.007557778583323442, -0.009033097842633602, -0.01983916936330064, -0.029765223942846076, 0.005325386741043857, -0.011970796565463692, 0.013937889532092287, -0.02264745336442215, 0.027616951483412554, 0.0008686915930067963, -0.012158446887486305, 0.009660756169555732, -0.012171388545921678, -0.0021693024215603584, 0.024071007852635964, -0.0057233348216897, -0.040713649049689354, 0.0023359230142867772, 0.01728971294295115, -0.016383813616281374, 0.006703645890395154, -0.027280473952028306, -0.022815691198791708, 0.007958961612916841, 0.026400459804874415, -0.010139588218438793, -0.026866350195322103, 0.013103169326817057, 0.01428730779058934, 0.005166854684839546, -0.018467379646183175, 0.017470892435756075, 
-0.011550201048217235, -0.009964879554851553, 0.0159567482011398, -0.005186266706831319, -0.005254209016633166, -0.024148655940603055, 0.01877797386069668, -0.006153636582762687, -0.021625082837124308, -0.00288916796935873, 0.020240351461571472, -0.02324275661393328, -0.0016856176000099963, -0.01239139208271015, 0.012643749765587053, 0.0112525484921391, -0.026426341259100018, 0.0015100998499399017, -0.017393244347788984, -0.016875587323599808, 0.03703829255984933, -0.021547432886512077, -0.007066005341666293, -0.02110742581293513, -0.006787765273241216]\n"
     ]
    },
    {
     "data": {
      "text/plain": "[0.0007671824387148089,\n -0.01789795785089765,\n 0.025727506604751052,\n 0.0006652689158043768,\n -0.01870032577272959]"
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain.embeddings import OpenAIEmbeddings\n",
    "\n",
    "embedding_model = OpenAIEmbeddings()\n",
    "embeddings = embedding_model.embed_documents([\n",
    "\"我在这里!\",\n",
    "\"你好啊!\",\n",
    "\"你叫什么名字?\",\n",
    "\"大家都叫我小明\",\n",
    "\"小明你好!\"\n",
    "])\n",
    "\n",
    "# Number of embedding vectors returned; each list item is the vector for one input sentence\n",
    "print(len(embeddings))\n",
    "\n",
    "# Dimensionality of a single embedding vector (1536 here)\n",
    "print(len(embeddings[3]))\n",
    "\n",
    "embedding_query = embedding_model.embed_query(\"对话中提到了谁?\")\n",
    "embedding_query[:5]\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-03T15:16:43.969662Z",
     "start_time": "2024-07-03T15:16:42.812311Z"
    }
   },
   "id": "bd94291edac4ac02",
   "execution_count": 3
  },
  {
   "cell_type": "markdown",
   "source": [
    "# 12.Chroma"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "1d4d7e638d74c9fa"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking in indexes: https://mirrors.aliyun.com/pypi/simple/\r\n",
      "Collecting sentence_transformers\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/58/4b/922436953394e1bfda05e4bf1fe0e80f609770f256c59a9df7a9254f3e0d/sentence_transformers-3.0.1-py3-none-any.whl (227 kB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m227.1/227.1 kB\u001B[0m \u001B[31m621.5 kB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0ma \u001B[36m0:00:01\u001B[0m\r\n",
      "\u001B[?25hRequirement already satisfied: transformers<5.0.0,>=4.34.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from sentence_transformers) (4.40.1)\r\n",
      "Requirement already satisfied: tqdm in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from sentence_transformers) (4.66.2)\r\n",
      "Requirement already satisfied: torch>=1.11.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from sentence_transformers) (2.3.0)\r\n",
      "Requirement already satisfied: numpy in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from sentence_transformers) (1.26.4)\r\n",
      "Collecting scikit-learn (from sentence_transformers)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/f9/4b/c035ce6771dd56283cd587e941054ebb38a14868729e28a0f7c6c9ff9ebd/scikit_learn-1.5.0-cp312-cp312-macosx_12_0_arm64.whl (11.0 MB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m11.0/11.0 MB\u001B[0m \u001B[31m615.4 kB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m00:01\u001B[0m00:01\u001B[0m\r\n",
      "\u001B[?25hCollecting scipy (from sentence_transformers)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/5c/76/f2b91ea2d2b76504e845699271be9c0ca3492770614fb6b911fb517023de/scipy-1.14.0-cp312-cp312-macosx_12_0_arm64.whl (29.9 MB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m29.9/29.9 MB\u001B[0m \u001B[31m646.2 kB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m00:01\u001B[0m00:02\u001B[0m\r\n",
      "\u001B[?25hRequirement already satisfied: huggingface-hub>=0.15.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from sentence_transformers) (0.22.2)\r\n",
      "Requirement already satisfied: Pillow in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from sentence_transformers) (10.3.0)\r\n",
      "Requirement already satisfied: filelock in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (3.13.4)\r\n",
      "Requirement already satisfied: fsspec>=2023.5.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (2024.3.1)\r\n",
      "Requirement already satisfied: packaging>=20.9 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (23.2)\r\n",
      "Requirement already satisfied: pyyaml>=5.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (6.0.1)\r\n",
      "Requirement already satisfied: requests in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (2.31.0)\r\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (4.11.0)\r\n",
      "Requirement already satisfied: sympy in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (1.12)\r\n",
      "Requirement already satisfied: networkx in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (3.3)\r\n",
      "Requirement already satisfied: jinja2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from torch>=1.11.0->sentence_transformers) (3.1.3)\r\n",
      "Requirement already satisfied: regex!=2019.12.17 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from transformers<5.0.0,>=4.34.0->sentence_transformers) (2024.4.16)\r\n",
      "Requirement already satisfied: tokenizers<0.20,>=0.19 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from transformers<5.0.0,>=4.34.0->sentence_transformers) (0.19.1)\r\n",
      "Requirement already satisfied: safetensors>=0.4.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from transformers<5.0.0,>=4.34.0->sentence_transformers) (0.4.3)\r\n",
      "Requirement already satisfied: joblib>=1.2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from scikit-learn->sentence_transformers) (1.4.2)\r\n",
      "Collecting threadpoolctl>=3.1.0 (from scikit-learn->sentence_transformers)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/4b/2c/ffbf7a134b9ab11a67b0cf0726453cedd9c5043a4fe7a35d1cefa9a1bcfb/threadpoolctl-3.5.0-py3-none-any.whl (18 kB)\r\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from jinja2->torch>=1.11.0->sentence_transformers) (2.1.5)\r\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (3.3.2)\r\n",
      "Requirement already satisfied: idna<4,>=2.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (3.7)\r\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (2.2.1)\r\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (2024.2.2)\r\n",
      "Requirement already satisfied: mpmath>=0.19 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from sympy->torch>=1.11.0->sentence_transformers) (1.3.0)\r\n",
      "Installing collected packages: threadpoolctl, scipy, scikit-learn, sentence_transformers\r\n",
      "Successfully installed scikit-learn-1.5.0 scipy-1.14.0 sentence_transformers-3.0.1 threadpoolctl-3.5.0\r\n",
      "\r\n",
      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m24.0\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m24.1.1\u001B[0m\r\n",
      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip3 install --upgrade pip\u001B[0m\r\n",
      "Note: you may need to restart the kernel to use updated packages.\n",
      "Looking in indexes: https://mirrors.aliyun.com/pypi/simple/\r\n",
      "Requirement already satisfied: chromadb in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (0.5.0)\r\n",
      "Requirement already satisfied: build>=1.0.3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.2.1)\r\n",
      "Requirement already satisfied: requests>=2.28 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (2.31.0)\r\n",
      "Requirement already satisfied: pydantic>=1.9 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.10.17)\r\n",
      "Requirement already satisfied: chroma-hnswlib==0.7.3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (0.7.3)\r\n",
      "Requirement already satisfied: fastapi>=0.95.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (0.111.0)\r\n",
      "Requirement already satisfied: uvicorn>=0.18.3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.29.0)\r\n",
      "Requirement already satisfied: numpy>=1.22.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.26.4)\r\n",
      "Requirement already satisfied: posthog>=2.4.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (3.5.0)\r\n",
      "Requirement already satisfied: typing-extensions>=4.5.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (4.11.0)\r\n",
      "Requirement already satisfied: onnxruntime>=1.14.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.17.3)\r\n",
      "Requirement already satisfied: opentelemetry-api>=1.2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.24.0)\r\n",
      "Requirement already satisfied: opentelemetry-exporter-otlp-proto-grpc>=1.2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.24.0)\r\n",
      "Requirement already satisfied: opentelemetry-instrumentation-fastapi>=0.41b0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (0.45b0)\r\n",
      "Requirement already satisfied: opentelemetry-sdk>=1.2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.24.0)\r\n",
      "Requirement already satisfied: tokenizers>=0.13.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (0.19.1)\r\n",
      "Requirement already satisfied: pypika>=0.48.9 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (0.48.9)\r\n",
      "Requirement already satisfied: tqdm>=4.65.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (4.66.2)\r\n",
      "Requirement already satisfied: overrides>=7.3.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (7.7.0)\r\n",
      "Requirement already satisfied: importlib-resources in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (6.4.0)\r\n",
      "Requirement already satisfied: grpcio>=1.58.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.62.2)\r\n",
      "Requirement already satisfied: bcrypt>=4.0.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (4.1.3)\r\n",
      "Requirement already satisfied: typer>=0.9.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (0.9.4)\r\n",
      "Requirement already satisfied: kubernetes>=28.1.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (29.0.0)\r\n",
      "Requirement already satisfied: tenacity>=8.2.3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (8.3.0)\r\n",
      "Requirement already satisfied: PyYAML>=6.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (6.0.1)\r\n",
      "Requirement already satisfied: mmh3>=4.0.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (4.1.0)\r\n",
      "Requirement already satisfied: orjson>=3.9.12 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (3.10.3)\r\n",
      "Requirement already satisfied: packaging>=19.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from build>=1.0.3->chromadb) (23.2)\r\n",
      "Requirement already satisfied: pyproject_hooks in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from build>=1.0.3->chromadb) (1.1.0)\r\n",
      "Requirement already satisfied: starlette<0.38.0,>=0.37.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastapi>=0.95.2->chromadb) (0.37.2)\r\n",
      "Requirement already satisfied: fastapi-cli>=0.0.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastapi>=0.95.2->chromadb) (0.0.3)\r\n",
      "Requirement already satisfied: httpx>=0.23.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastapi>=0.95.2->chromadb) (0.27.0)\r\n",
      "Requirement already satisfied: jinja2>=2.11.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastapi>=0.95.2->chromadb) (3.1.3)\r\n",
      "Requirement already satisfied: python-multipart>=0.0.7 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastapi>=0.95.2->chromadb) (0.0.9)\r\n",
      "Requirement already satisfied: ujson!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0,>=4.0.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastapi>=0.95.2->chromadb) (5.10.0)\r\n",
      "Requirement already satisfied: email_validator>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastapi>=0.95.2->chromadb) (2.1.1)\r\n",
      "Requirement already satisfied: certifi>=14.05.14 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (2024.2.2)\r\n",
      "Requirement already satisfied: six>=1.9.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (1.16.0)\r\n",
      "Requirement already satisfied: python-dateutil>=2.5.3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (2.9.0.post0)\r\n",
      "Requirement already satisfied: google-auth>=1.0.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (2.29.0)\r\n",
      "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (1.8.0)\r\n",
      "Requirement already satisfied: requests-oauthlib in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (2.0.0)\r\n",
      "Requirement already satisfied: oauthlib>=3.2.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (3.2.2)\r\n",
      "Requirement already satisfied: urllib3>=1.24.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (2.2.1)\r\n",
      "Requirement already satisfied: coloredlogs in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from onnxruntime>=1.14.1->chromadb) (15.0.1)\r\n",
      "Requirement already satisfied: flatbuffers in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from onnxruntime>=1.14.1->chromadb) (24.3.25)\r\n",
      "Requirement already satisfied: protobuf in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from onnxruntime>=1.14.1->chromadb) (4.25.3)\r\n",
      "Requirement already satisfied: sympy in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from onnxruntime>=1.14.1->chromadb) (1.12)\r\n",
      "Requirement already satisfied: deprecated>=1.2.6 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-api>=1.2.0->chromadb) (1.2.14)\r\n",
      "Requirement already satisfied: importlib-metadata<=7.0,>=6.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-api>=1.2.0->chromadb) (7.0.0)\r\n",
      "Requirement already satisfied: googleapis-common-protos~=1.52 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.63.0)\r\n",
      "Requirement already satisfied: opentelemetry-exporter-otlp-proto-common==1.24.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.24.0)\r\n",
      "Requirement already satisfied: opentelemetry-proto==1.24.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.24.0)\r\n",
      "Requirement already satisfied: opentelemetry-instrumentation-asgi==0.45b0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (0.45b0)\r\n",
      "Requirement already satisfied: opentelemetry-instrumentation==0.45b0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (0.45b0)\r\n",
      "Requirement already satisfied: opentelemetry-semantic-conventions==0.45b0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (0.45b0)\r\n",
      "Requirement already satisfied: opentelemetry-util-http==0.45b0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (0.45b0)\r\n",
      "Requirement already satisfied: setuptools>=16.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-instrumentation==0.45b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (69.5.1)\r\n",
      "Requirement already satisfied: wrapt<2.0.0,>=1.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-instrumentation==0.45b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (1.16.0)\r\n",
      "Requirement already satisfied: asgiref~=3.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-instrumentation-asgi==0.45b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (3.8.1)\r\n",
      "Requirement already satisfied: monotonic>=1.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from posthog>=2.4.0->chromadb) (1.6)\r\n",
      "Requirement already satisfied: backoff>=1.10.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from posthog>=2.4.0->chromadb) (2.2.1)\r\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests>=2.28->chromadb) (3.3.2)\r\n",
      "Requirement already satisfied: idna<4,>=2.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests>=2.28->chromadb) (3.7)\r\n",
      "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from tokenizers>=0.13.2->chromadb) (0.22.2)\r\n",
      "Requirement already satisfied: click<9.0.0,>=7.1.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from typer>=0.9.0->chromadb) (8.1.7)\r\n",
      "Requirement already satisfied: h11>=0.8 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from uvicorn>=0.18.3->uvicorn[standard]>=0.18.3->chromadb) (0.14.0)\r\n",
      "Requirement already satisfied: httptools>=0.5.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.6.1)\r\n",
      "Requirement already satisfied: python-dotenv>=0.13 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (1.0.1)\r\n",
      "Requirement already satisfied: uvloop!=0.15.0,!=0.15.1,>=0.14.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.19.0)\r\n",
      "Requirement already satisfied: watchfiles>=0.13 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.21.0)\r\n",
      "Requirement already satisfied: websockets>=10.4 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (12.0)\r\n",
      "Requirement already satisfied: dnspython>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from email_validator>=2.0.0->fastapi>=0.95.2->chromadb) (2.6.1)\r\n",
      "Collecting typer>=0.9.0 (from chromadb)\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/20/b5/11cf2e34fbb11b937e006286ab5b8cfd334fde1c8fa4dd7f491226931180/typer-0.12.3-py3-none-any.whl (47 kB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m47.2/47.2 kB\u001B[0m \u001B[31m634.4 kB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0ma \u001B[36m0:00:01\u001B[0m\r\n",
      "\u001B[?25hRequirement already satisfied: shellingham>=1.3.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from typer>=0.9.0->chromadb) (1.5.4)\r\n",
      "Requirement already satisfied: rich>=10.11.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from typer>=0.9.0->chromadb) (13.7.1)\r\n",
      "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (5.3.3)\r\n",
      "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (0.4.0)\r\n",
      "Requirement already satisfied: rsa<5,>=3.1.4 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (4.9)\r\n",
      "Requirement already satisfied: anyio in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpx>=0.23.0->fastapi>=0.95.2->chromadb) (3.7.1)\r\n",
      "Requirement already satisfied: httpcore==1.* in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpx>=0.23.0->fastapi>=0.95.2->chromadb) (1.0.5)\r\n",
      "Requirement already satisfied: sniffio in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpx>=0.23.0->fastapi>=0.95.2->chromadb) (1.3.1)\r\n",
      "Requirement already satisfied: filelock in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers>=0.13.2->chromadb) (3.13.4)\r\n",
      "Requirement already satisfied: fsspec>=2023.5.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers>=0.13.2->chromadb) (2024.3.1)\r\n",
      "Requirement already satisfied: zipp>=0.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from importlib-metadata<=7.0,>=6.0->opentelemetry-api>=1.2.0->chromadb) (3.18.1)\r\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from jinja2>=2.11.2->fastapi>=0.95.2->chromadb) (2.1.5)\r\n",
      "Requirement already satisfied: markdown-it-py>=2.2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from rich>=10.11.0->typer>=0.9.0->chromadb) (3.0.0)\r\n",
      "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from rich>=10.11.0->typer>=0.9.0->chromadb) (2.17.2)\r\n",
      "Requirement already satisfied: humanfriendly>=9.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from coloredlogs->onnxruntime>=1.14.1->chromadb) (10.0)\r\n",
      "Requirement already satisfied: mpmath>=0.19 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from sympy->onnxruntime>=1.14.1->chromadb) (1.3.0)\r\n",
      "Requirement already satisfied: mdurl~=0.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer>=0.9.0->chromadb) (0.1.2)\r\n",
      "Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (0.6.0)\r\n",
      "Installing collected packages: typer\r\n",
      "  Attempting uninstall: typer\r\n",
      "    Found existing installation: typer 0.9.4\r\n",
      "    Uninstalling typer-0.9.4:\r\n",
      "      Successfully uninstalled typer-0.9.4\r\n",
      "\u001B[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\r\n",
      "spacy 3.7.4 requires typer<0.10.0,>=0.3.0, but you have typer 0.12.3 which is incompatible.\r\n",
      "weasel 0.3.4 requires typer<0.10.0,>=0.3.0, but you have typer 0.12.3 which is incompatible.\u001B[0m\u001B[31m\r\n",
      "\u001B[0mSuccessfully installed typer-0.12.3\r\n",
      "\r\n",
      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m24.0\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m24.1.1\u001B[0m\r\n",
      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip3 install --upgrade pip\u001B[0m\r\n",
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "%pip install sentence_transformers\n",
    "%pip install chromadb"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-03T15:28:28.894842Z",
     "start_time": "2024-07-03T15:27:09.376725Z"
    }
   },
   "id": "7b9141ee738c7dc3",
   "execution_count": 7
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/langchain_core/_api/deprecation.py:139: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 0.3.0. An updated version of the class exists in the langchain-huggingface package and should be used instead. To use it run `pip install -U langchain-huggingface` and import as `from langchain_huggingface import HuggingFaceEmbeddings`.\n",
      "  warn_deprecated(\n",
      "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
      "  from tqdm.autonotebook import tqdm, trange\n"
     ]
    },
    {
     "data": {
      "text/plain": "modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]",
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "93d1dacf39cd48a392c4bd45b339dd2e"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": "config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]",
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "85dcf450d95f4b1b96609a343deff57d"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": "README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]",
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "51e9bc929034444da5927c5d2855c3fd"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": "sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]",
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "6a4906a111734d5bb98613a3d74da437"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": "config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]",
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "1173ceb00f5d46a9b9c7ba55749fb07a"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": "model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]",
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "ec5963ac3cff43e29b5a47da280f25ba"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": "tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]",
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "9c0db4d2e3d64d5585b5ac875018f3a5"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": "vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]",
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "4d2e9d1c35f7494aa50e48715860cbe1"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": "tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]",
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "2d8f2722810e4b23a16c64c6b6ec216f"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": "special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]",
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "2ab0e102505a4a4992c1978a6a1cfbaa"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": "1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]",
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "75221362879c49d69ee98c7019c2095d"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "小明说：“今天天气真好啊”\n",
      "小红说：“冰淇淋真好吃”\n",
      "小刚在打篮球\n",
      "小非在踢足球\n"
     ]
    }
   ],
   "source": [
    "from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "from langchain.vectorstores import Chroma\n",
    "from langchain.document_loaders import TextLoader\n",
    "\n",
    "#加载文档\n",
    "loader = TextLoader(\"./file/test.txt\")\n",
    "documents = loader.load()\n",
    "\n",
    "#将文档切割成小块\n",
    "text_splitter = CharacterTextSplitter(chunk_size=10,chunk_overlap=0)\n",
    "docs = text_splitter.split_documents(documents)\n",
    "\n",
    "#使用句子转换器对文本进行Embedding\n",
    "embedding_functions = SentenceTransformerEmbeddings(model_name=\"all-MiniLM-L6-v2\");\n",
    "#将向量保存到Chroma中\n",
    "db = Chroma.from_documents(docs, embedding_functions)\n",
    "\n",
    "#查询，返回最相似的文档块（chunk）\n",
    "query = \"小明在干什么？\"\n",
    "answer_docs = db.similarity_search(query)\n",
    "\n",
    "print(answer_docs[0].page_content)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-03T15:33:25.925812Z",
     "start_time": "2024-07-03T15:32:58.827046Z"
    }
   },
   "id": "40b927c85734e14f",
   "execution_count": 2
  },
  {
   "cell_type": "markdown",
   "source": [
    "持久化"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "43bb932ca55edc32"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "小明说：“今天天气真好啊”\n",
      "小红说：“冰淇淋真好吃”\n",
      "小刚在打篮球\n",
      "小非在踢足球\n"
     ]
    }
   ],
   "source": [
    "#将 chroma 数据保存到磁盘上\n",
    "db_to_disk = Chroma.from_documents(docs,embedding_functions,persist_directory=\"./file/chroma_db\")\n",
    "db_to_disk.persist()\n",
    "# query_docs = db_to_disk.similarity_search(\"小明在干什么\")\n",
    "# print(query_docs[0].page_content)\n",
    "\n",
    "\n",
    "db_from_disk = Chroma(persist_directory=\"./file/chroma_db\",embedding_function = embedding_functions)\n",
    "docs = db_to_disk.similarity_search(query)\n",
    "print(docs[0].page_content)\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-03T15:42:54.040948Z",
     "start_time": "2024-07-03T15:42:53.918407Z"
    }
   },
   "id": "54ecf2c33253d8db",
   "execution_count": 4
  },
  {
   "cell_type": "markdown",
   "source": [
    "# 12.3定义metadata 以便查询"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "90a221867cdb2640"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['1', '2', '3', '4']\n",
      "{'source': 'specified-source'}\n",
      "{'ids': ['1'], 'embeddings': None, 'metadatas': [{'source': 'specified-source'}], 'documents': ['小明说：“今天天气真好啊”\\n小红说：“冰淇淋真好吃”\\n小刚在打篮球\\n小非在踢足球'], 'uris': None, 'data': None}\n",
      "{'ids': ['2'], 'embeddings': None, 'metadatas': [{'source': './file/test.txt'}], 'documents': ['小明说：“今天天气真好啊”\\n小红说：“冰淇淋真好吃”\\n小刚在打篮球\\n小非在踢足球'], 'uris': None, 'data': None}\n",
      "{'ids': ['3'], 'embeddings': None, 'metadatas': [{'source': './file/test.txt'}], 'documents': ['小明说：“今天天气真好啊”\\n小红说：“冰淇淋真好吃”\\n小刚在打篮球\\n小非在踢足球'], 'uris': None, 'data': None}\n",
      "{'ids': ['4'], 'embeddings': None, 'metadatas': [{'source': './file/test.txt'}], 'documents': ['小明说：“今天天气真好啊”\\n小红说：“冰淇淋真好吃”\\n小刚在打篮球\\n小非在踢足球'], 'uris': None, 'data': None}\n"
     ]
    }
   ],
   "source": [
    "# create simple ids\n",
    "ids = [str(i) for i in range(1,len(docs) + 1)]\n",
    "\n",
    "print(ids)\n",
    "# 插入文本块ID，针对每个文本块 配置对应的ID\n",
    "example_db = Chroma.from_documents(docs,embedding_functions,ids=ids)\n",
    "docs = example_db.similarity_search(query)\n",
    "print(docs[0].metadata)\n",
    "\n",
    "\n",
    "# 更新文档的metadata 加入source 的字段输入定义好的信息，为了后面查询做准备\n",
    "docs[0].metadata = {\"source\": \"specified-source\"}\n",
    "example_db.update_document(ids[0],docs[0])\n",
    "print(example_db._collection.get(ids = [ids[0]]))\n",
    "print(example_db._collection.get(ids = [ids[1]]))\n",
    "print(example_db._collection.get(ids = [ids[2]]))\n",
    "print(example_db._collection.get(ids = [ids[3]]))\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-03T16:03:31.071455Z",
     "start_time": "2024-07-03T16:03:30.999009Z"
    }
   },
   "id": "ddc43a88f1a34c2b",
   "execution_count": 20
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "data": {
      "text/plain": "{'ids': ['0', '1'],\n 'embeddings': None,\n 'metadatas': [{'source': 'specified-source'}, {'source': 'specified-source'}],\n 'documents': ['小明说：“今天天气真好啊”\\n小红说：“冰淇淋真好吃”\\n小刚在打篮球\\n小非在踢足球',\n  '小明说：“今天天气真好啊”\\n小红说：“冰淇淋真好吃”\\n小刚在打篮球\\n小非在踢足球'],\n 'uris': None,\n 'data': None}"
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "example_db.get(where={'source': 'specified-source'})\n",
    "# example_db.get(where={'source': './file/test.txt'})"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-03T16:04:37.002044Z",
     "start_time": "2024-07-03T16:04:36.975182Z"
    }
   },
   "id": "b9d020f52b10e40d",
   "execution_count": 23
  },
  {
   "cell_type": "markdown",
   "source": [
    "# 13 FAISS"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "7b021ca98b295d43"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "小明说：“今天天气真好啊”\n",
      "小红说：“冰淇淋真好吃”\n",
      "小刚在打篮球\n",
      "小非在踢足球\n"
     ]
    }
   ],
   "source": [
    "from langchain.embeddings import OpenAIEmbeddings\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "from langchain.vectorstores import FAISS\n",
    "\n",
    "from langchain.document_loaders import TextLoader\n",
    "\n",
    "loader = TextLoader(\"./file/test.txt\")\n",
    "documents = loader.load()\n",
    "\n",
    "text_splitter = CharacterTextSplitter(chunk_size=10,chunk_overlap=0)\n",
    "docs = text_splitter.split_documents(documents)\n",
    "\n",
    "embeddings = OpenAIEmbeddings()\n",
    "db = FAISS.from_documents(docs,embeddings)\n",
    "\n",
    "query = \"小明在干什么？\"\n",
    "answer = db.similarity_search(query)\n",
    "\n",
    "print(answer[0].page_content)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-08T15:12:53.346516Z",
     "start_time": "2024-07-08T15:12:51.123854Z"
    }
   },
   "id": "d276dc4a10454104",
   "execution_count": 4
  },
  {
   "cell_type": "markdown",
   "source": [
    "## 13.1 查询结果通过score返回相似度"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "bcdb0c2bd6d33bb6"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "data": {
      "text/plain": "(Document(page_content='小明说：“今天天气真好啊”\\n小红说：“冰淇淋真好吃”\\n小刚在打篮球\\n小非在踢足球', metadata={'source': './file/test.txt'}),\n 0.2723874)"
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "docs_and_score = db.similarity_search_with_score(query)\n",
    "docs_and_score[0]\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-08T15:14:28.199151Z",
     "start_time": "2024-07-08T15:14:27.803281Z"
    }
   },
   "id": "fa2c955407fa65c9",
   "execution_count": 6
  },
  {
   "cell_type": "markdown",
   "source": [
    "## 13.2 合并库"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "864a5cfe4d35a1b1"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "data": {
      "text/plain": "{'416dca34-89dd-4bc7-b860-db56df17bb4b': Document(page_content='你好'),\n 'fd77d83e-0b39-496f-b210-4c7351843d5f': Document(page_content='世界')}"
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "db1 = FAISS.from_texts([\"你好\"],embeddings)\n",
    "db2 = FAISS.from_texts([\"世界\"],embeddings)\n",
    "\n",
    "db1.merge_from(db2)\n",
    "db1.docstore._dict"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-08T15:18:46.655235Z",
     "start_time": "2024-07-08T15:18:45.366625Z"
    }
   },
   "id": "d3577549214d6157",
   "execution_count": 10
  },
  {
   "cell_type": "markdown",
   "source": [
    "## 13.3将Page 作为metadata进行查询"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "36092c24e6f96048"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Content: 你好,Metadata : {'page': 1},score:0.0\n",
      "Content: 你好,Metadata : {'page': 2},score:0.0\n",
      "Content: 你好,Metadata : {'page': 3},score:0.0\n",
      "Content: 你好,Metadata : {'page': 4},score:0.0\n"
     ]
    }
   ],
   "source": [
    "from langchain.schema import Document\n",
    "\n",
    "list_of_documents = [\n",
    "    Document(page_content=\"你好\", metadata=dict(page=1)),\n",
    "    Document(page_content=\"小明\", metadata=dict(page=1)),\n",
    "    Document(page_content=\"你好\", metadata=dict(page=2)),\n",
    "    Document(page_content=\"小红\", metadata=dict(page=2)),\n",
    "    Document(page_content=\"你好\", metadata=dict(page=3)),\n",
    "    Document(page_content=\"小熊\", metadata=dict(page=3)),\n",
    "    Document(page_content=\"你好\", metadata=dict(page=4)),\n",
    "    Document(page_content=\"小鸟\", metadata=dict(page=4)),\n",
    "]\n",
    "\n",
    "db = FAISS.from_documents(list_of_documents,embeddings)\n",
    "\n",
    "results_with_scores = db.similarity_search_with_score(\"你好\")\n",
    "\n",
    "for doc,score in results_with_scores:\n",
    "    print(f\"Content: {doc.page_content},Metadata : {doc.metadata},score:{score}\")\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-08T15:42:22.093267Z",
     "start_time": "2024-07-08T15:42:20.457382Z"
    }
   },
   "id": "e23e1f0e6af2b7f8",
   "execution_count": 12
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Content: 你好,Metadata : {'page': 1},score:0.0\n",
      "Content: 小明,Metadata : {'page': 1},score:0.32669496536254883\n"
     ]
    }
   ],
   "source": [
    "results_with_scores = db.similarity_search_with_score(\"你好\",filter=dict(page=1))\n",
    "\n",
    "for doc,score in results_with_scores:\n",
    "    print(f\"Content: {doc.page_content},Metadata : {doc.metadata},score:{score}\")\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-08T15:45:04.833007Z",
     "start_time": "2024-07-08T15:45:03.429943Z"
    }
   },
   "id": "b4a57369248a6e4a",
   "execution_count": 14
  },
  {
   "cell_type": "markdown",
   "source": [
    "# 14.MultiQueryRetriever 多查询索引器"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "e0ad7cc8a0aa9dc7"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking in indexes: https://mirrors.aliyun.com/pypi/simple/\r\n",
      "Requirement already satisfied: chromadb in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (0.5.0)\r\n",
      "Requirement already satisfied: build>=1.0.3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.2.1)\r\n",
      "Requirement already satisfied: requests>=2.28 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (2.31.0)\r\n",
      "Requirement already satisfied: pydantic>=1.9 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.10.17)\r\n",
      "Requirement already satisfied: chroma-hnswlib==0.7.3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (0.7.3)\r\n",
      "Requirement already satisfied: fastapi>=0.95.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (0.111.0)\r\n",
      "Requirement already satisfied: uvicorn>=0.18.3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.29.0)\r\n",
      "Requirement already satisfied: numpy>=1.22.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.26.4)\r\n",
      "Requirement already satisfied: posthog>=2.4.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (3.5.0)\r\n",
      "Requirement already satisfied: typing-extensions>=4.5.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (4.11.0)\r\n",
      "Requirement already satisfied: onnxruntime>=1.14.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.17.3)\r\n",
      "Requirement already satisfied: opentelemetry-api>=1.2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.24.0)\r\n",
      "Requirement already satisfied: opentelemetry-exporter-otlp-proto-grpc>=1.2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.24.0)\r\n",
      "Requirement already satisfied: opentelemetry-instrumentation-fastapi>=0.41b0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (0.45b0)\r\n",
      "Requirement already satisfied: opentelemetry-sdk>=1.2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.24.0)\r\n",
      "Requirement already satisfied: tokenizers>=0.13.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (0.19.1)\r\n",
      "Requirement already satisfied: pypika>=0.48.9 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (0.48.9)\r\n",
      "Requirement already satisfied: tqdm>=4.65.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (4.66.2)\r\n",
      "Requirement already satisfied: overrides>=7.3.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (7.7.0)\r\n",
      "Requirement already satisfied: importlib-resources in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (6.4.0)\r\n",
      "Requirement already satisfied: grpcio>=1.58.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.62.2)\r\n",
      "Requirement already satisfied: bcrypt>=4.0.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (4.1.3)\r\n",
      "Requirement already satisfied: typer>=0.9.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (0.12.3)\r\n",
      "Requirement already satisfied: kubernetes>=28.1.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (29.0.0)\r\n",
      "Requirement already satisfied: tenacity>=8.2.3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (8.3.0)\r\n",
      "Requirement already satisfied: PyYAML>=6.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (6.0.1)\r\n",
      "Requirement already satisfied: mmh3>=4.0.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (4.1.0)\r\n",
      "Requirement already satisfied: orjson>=3.9.12 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (3.10.3)\r\n",
      "Requirement already satisfied: packaging>=19.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from build>=1.0.3->chromadb) (23.2)\r\n",
      "Requirement already satisfied: pyproject_hooks in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from build>=1.0.3->chromadb) (1.1.0)\r\n",
      "Requirement already satisfied: starlette<0.38.0,>=0.37.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastapi>=0.95.2->chromadb) (0.37.2)\r\n",
      "Requirement already satisfied: fastapi-cli>=0.0.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastapi>=0.95.2->chromadb) (0.0.3)\r\n",
      "Requirement already satisfied: httpx>=0.23.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastapi>=0.95.2->chromadb) (0.27.0)\r\n",
      "Requirement already satisfied: jinja2>=2.11.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastapi>=0.95.2->chromadb) (3.1.3)\r\n",
      "Requirement already satisfied: python-multipart>=0.0.7 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastapi>=0.95.2->chromadb) (0.0.9)\r\n",
      "Requirement already satisfied: ujson!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0,>=4.0.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastapi>=0.95.2->chromadb) (5.10.0)\r\n",
      "Requirement already satisfied: email_validator>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastapi>=0.95.2->chromadb) (2.1.1)\r\n",
      "Requirement already satisfied: certifi>=14.05.14 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (2024.2.2)\r\n",
      "Requirement already satisfied: six>=1.9.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (1.16.0)\r\n",
      "Requirement already satisfied: python-dateutil>=2.5.3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (2.9.0.post0)\r\n",
      "Requirement already satisfied: google-auth>=1.0.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (2.29.0)\r\n",
      "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (1.8.0)\r\n",
      "Requirement already satisfied: requests-oauthlib in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (2.0.0)\r\n",
      "Requirement already satisfied: oauthlib>=3.2.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (3.2.2)\r\n",
      "Requirement already satisfied: urllib3>=1.24.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (2.2.1)\r\n",
      "Requirement already satisfied: coloredlogs in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from onnxruntime>=1.14.1->chromadb) (15.0.1)\r\n",
      "Requirement already satisfied: flatbuffers in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from onnxruntime>=1.14.1->chromadb) (24.3.25)\r\n",
      "Requirement already satisfied: protobuf in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from onnxruntime>=1.14.1->chromadb) (4.25.3)\r\n",
      "Requirement already satisfied: sympy in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from onnxruntime>=1.14.1->chromadb) (1.12)\r\n",
      "Requirement already satisfied: deprecated>=1.2.6 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-api>=1.2.0->chromadb) (1.2.14)\r\n",
      "Requirement already satisfied: importlib-metadata<=7.0,>=6.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-api>=1.2.0->chromadb) (7.0.0)\r\n",
      "Requirement already satisfied: googleapis-common-protos~=1.52 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.63.0)\r\n",
      "Requirement already satisfied: opentelemetry-exporter-otlp-proto-common==1.24.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.24.0)\r\n",
      "Requirement already satisfied: opentelemetry-proto==1.24.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.24.0)\r\n",
      "Requirement already satisfied: opentelemetry-instrumentation-asgi==0.45b0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (0.45b0)\r\n",
      "Requirement already satisfied: opentelemetry-instrumentation==0.45b0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (0.45b0)\r\n",
      "Requirement already satisfied: opentelemetry-semantic-conventions==0.45b0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (0.45b0)\r\n",
      "Requirement already satisfied: opentelemetry-util-http==0.45b0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (0.45b0)\r\n",
      "Requirement already satisfied: setuptools>=16.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-instrumentation==0.45b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (69.5.1)\r\n",
      "Requirement already satisfied: wrapt<2.0.0,>=1.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-instrumentation==0.45b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (1.16.0)\r\n",
      "Requirement already satisfied: asgiref~=3.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-instrumentation-asgi==0.45b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (3.8.1)\r\n",
      "Requirement already satisfied: monotonic>=1.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from posthog>=2.4.0->chromadb) (1.6)\r\n",
      "Requirement already satisfied: backoff>=1.10.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from posthog>=2.4.0->chromadb) (2.2.1)\r\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests>=2.28->chromadb) (3.3.2)\r\n",
      "Requirement already satisfied: idna<4,>=2.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests>=2.28->chromadb) (3.7)\r\n",
      "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from tokenizers>=0.13.2->chromadb) (0.22.2)\r\n",
      "Requirement already satisfied: click>=8.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from typer>=0.9.0->chromadb) (8.1.7)\r\n",
      "Requirement already satisfied: shellingham>=1.3.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from typer>=0.9.0->chromadb) (1.5.4)\r\n",
      "Requirement already satisfied: rich>=10.11.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from typer>=0.9.0->chromadb) (13.7.1)\r\n",
      "Requirement already satisfied: h11>=0.8 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from uvicorn>=0.18.3->uvicorn[standard]>=0.18.3->chromadb) (0.14.0)\r\n",
      "Requirement already satisfied: httptools>=0.5.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.6.1)\r\n",
      "Requirement already satisfied: python-dotenv>=0.13 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (1.0.1)\r\n",
      "Requirement already satisfied: uvloop!=0.15.0,!=0.15.1,>=0.14.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.19.0)\r\n",
      "Requirement already satisfied: watchfiles>=0.13 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.21.0)\r\n",
      "Requirement already satisfied: websockets>=10.4 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (12.0)\r\n",
      "Requirement already satisfied: dnspython>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from email_validator>=2.0.0->fastapi>=0.95.2->chromadb) (2.6.1)\r\n",
      "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (5.3.3)\r\n",
      "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (0.4.0)\r\n",
      "Requirement already satisfied: rsa<5,>=3.1.4 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (4.9)\r\n",
      "Requirement already satisfied: anyio in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpx>=0.23.0->fastapi>=0.95.2->chromadb) (3.7.1)\r\n",
      "Requirement already satisfied: httpcore==1.* in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpx>=0.23.0->fastapi>=0.95.2->chromadb) (1.0.5)\r\n",
      "Requirement already satisfied: sniffio in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpx>=0.23.0->fastapi>=0.95.2->chromadb) (1.3.1)\r\n",
      "Requirement already satisfied: filelock in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers>=0.13.2->chromadb) (3.13.4)\r\n",
      "Requirement already satisfied: fsspec>=2023.5.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers>=0.13.2->chromadb) (2024.3.1)\r\n",
      "Requirement already satisfied: zipp>=0.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from importlib-metadata<=7.0,>=6.0->opentelemetry-api>=1.2.0->chromadb) (3.18.1)\r\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from jinja2>=2.11.2->fastapi>=0.95.2->chromadb) (2.1.5)\r\n",
      "Requirement already satisfied: markdown-it-py>=2.2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from rich>=10.11.0->typer>=0.9.0->chromadb) (3.0.0)\r\n",
      "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from rich>=10.11.0->typer>=0.9.0->chromadb) (2.17.2)\r\n",
      "Requirement already satisfied: humanfriendly>=9.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from coloredlogs->onnxruntime>=1.14.1->chromadb) (10.0)\r\n",
      "Requirement already satisfied: mpmath>=0.19 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from sympy->onnxruntime>=1.14.1->chromadb) (1.3.0)\r\n",
      "Requirement already satisfied: mdurl~=0.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer>=0.9.0->chromadb) (0.1.2)\r\n",
      "Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (0.6.0)\r\n",
      "\r\n",
      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m24.0\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m24.1.2\u001B[0m\r\n",
      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip3 install --upgrade pip\u001B[0m\r\n",
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "%pip install chromadb"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-14T14:13:49.210970Z",
     "start_time": "2024-07-14T14:13:46.447264Z"
    }
   },
   "id": "164dccc72877e765",
   "execution_count": 1
  },
  {
   "cell_type": "markdown",
   "source": [
    "## 14.1 通过MultiQueryRetriever生成多维度查询输入"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "ddcbb4d5138c56dd"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "USER_AGENT environment variable not set, consider setting it to identify your requests.\n",
      "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/langchain_core/_api/deprecation.py:139: LangChainDeprecationWarning: The class `OpenAIEmbeddings` was deprecated in LangChain 0.0.9 and will be removed in 0.3.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import OpenAIEmbeddings`.\n",
      "  warn_deprecated(\n"
     ]
    }
   ],
   "source": [
    "from langchain.vectorstores import  Chroma\n",
    "from langchain.document_loaders import WebBaseLoader\n",
    "from langchain.embeddings import OpenAIEmbeddings\n",
    "from langchain.text_splitter import  RecursiveCharacterTextSplitter\n",
    "\n",
    "#加载文档\n",
    "loader = WebBaseLoader(\"https://chatgptzhanghao.com/\")\n",
    "data = loader.load()\n",
    "\n",
    "# 切割文档\n",
    "text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=0)\n",
    "splits = text_splitter.split_documents(data)\n",
    "\n",
    "# 存储到chroma的向量数据库中\n",
    "embeddings = OpenAIEmbeddings()\n",
    "vectordb = Chroma.from_documents(documents=splits,embedding=embeddings)\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-14T14:46:45.110436Z",
     "start_time": "2024-07-14T14:46:36.727898Z"
    }
   },
   "id": "a34c17a228358e79",
   "execution_count": 2
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/langchain_core/_api/deprecation.py:139: LangChainDeprecationWarning: The class `ChatOpenAI` was deprecated in LangChain 0.0.10 and will be removed in 0.3.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import ChatOpenAI`.\n",
      "  warn_deprecated(\n",
      "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/langchain_core/_api/deprecation.py:139: LangChainDeprecationWarning: The method `BaseRetriever.get_relevant_documents` was deprecated in langchain-core 0.1.46 and will be removed in 0.3.0. Use invoke instead.\n",
      "  warn_deprecated(\n",
      "INFO:langchain.retrievers.multi_query:Generated queries: ['1. GPT是什么？', '2. 请问GPT是指什么？', '3. 您能解释一下GPT是什么吗？']\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "page_content='想要购买\\xa0 ChatGPT 已注册好可以直接使用的朋友，可以直接 点此链接到某宝购买ChatGPT成品号，无需看下文，马上提升工作效率！永久免费使用的官方正版账号。也有功能更强大的ChatGPT会员付费账号（经济型拼车号或独立私人号都有），点此淘宝购买GPT高级账号如果分不清什么普通账号和高级账号的区别，点击ChatGPT4.0会员账号与3.5账号的区别，一文帮你解答，新手小白必看 \\n\\n\\n\\nChatGPT 是什么 \\n\\n\\n\\nChatGPT 是由 OpenAI 开发的一种大型语言模型，可以用于回答各种问题、语音对话，生成文本、进行对话等自然语言处理任务。简单来说，Chat GPT就是最新一代通用型超强 AI，拥有庞大的知识库，能够回答各种各样的问题。有什么问题，问ChatGPT就行了！ \\n\\n\\n\\n \\n\\n\\n\\nChatGPT 官网\\n \\n\\n\\n\\nChatGPT的官方网址是\\xa0https://chat.openai.com\\xa0\\xa0\\xa0请认准其官方域名 openai.com，不是这个域名的ChatGPT服务都是镜像站或者假冒伪劣网站，请注意辨别。（国内山寨版本很多）' metadata={'description': '本文提供了适合小白简单易学的 ChatGPT 最新使用教程，一步步手把手，超详细亲测有效。通过本文你将学会如何注册 ChatGPT 账号，如何使用 ChatGPT，适合中国地区的用户。买ChatGPT账号就到ChatGPT账号网。', 'language': 'en-US', 'source': 'https://chatgptzhanghao.com/', 'title': 'ChatGPT 怎么用最新详细教程-新手小白一看就会-ChatGPT 账号网'}\n",
      "page_content='你可以尝试用各种方式向 ChatGPT 提出各种各样的问题或者指令，通过这一步你将更能体会到 ChatGPT 的强大之处。来吧，朋友，让我们真正学会 Chat GPT 怎么用。比如：叫 ChatGPT 写一首诗叫 ChatGPT 写代码叫 ChatGPT进行翻译叫 ChatGPT 编故事 \\n\\n\\n\\n \\n\\n\\n\\n \\n\\n\\n\\n \\n\\n\\n\\n \\n\\n\\n\\n\\xa0好啦，到这里为止，我们已经一步步教会了你怎麽注冊 ChatGPT账号，怎么通过 ChatGPT 手机号验证，怎么用 ChatGPT。\\xa0祝大家玩的愉快！ \\n\\n\\n\\n如果看到这里还是觉得 ChatGPT 账号注册过于麻烦的朋友，可以直接到某宝购买哦，\\xa0点此购买ChatGPT现成账号\\xa0打开看到 “皮特的号” 就是ChatGPT账号。这更适合不懂技术或不想折腾但是又想马上尽快体验到 ChatGPT 的朋友，而且账号可靠，售后保证，有什么不懂随时可以咨询，用起来也很省心。 \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n更多阅读' metadata={'description': '本文提供了适合小白简单易学的 ChatGPT 最新使用教程，一步步手把手，超详细亲测有效。通过本文你将学会如何注册 ChatGPT 账号，如何使用 ChatGPT，适合中国地区的用户。买ChatGPT账号就到ChatGPT账号网。', 'language': 'en-US', 'source': 'https://chatgptzhanghao.com/', 'title': 'ChatGPT 怎么用最新详细教程-新手小白一看就会-ChatGPT 账号网'}\n",
      "page_content='ChatGPT 怎么用最新详细教程-新手小白一看就会-ChatGPT 账号网\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nSkip to content\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\nChatGPT 账号网 \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nChatGPT 怎么用最新详细教程-新手小白一看就会 \\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t本文最后更新时间：2024-05-06\\t\\t\\t\\t\\t\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n ChatGPT 持续火爆，相信大家都听说过他的大名。因为不仅功能十分强大，关键账号还是永久免费使用。但是ChatGPT不支持国内网络和国内用户，很多新手小白朋友还是不清楚 ChatGPT 是什么，怎么才能拥有账号，本文将手把手告诉大家，ChatGPT\\xa0是什么和怎么用怎么注册账号。' metadata={'description': '本文提供了适合小白简单易学的 ChatGPT 最新使用教程，一步步手把手，超详细亲测有效。通过本文你将学会如何注册 ChatGPT 账号，如何使用 ChatGPT，适合中国地区的用户。买ChatGPT账号就到ChatGPT账号网。', 'language': 'en-US', 'source': 'https://chatgptzhanghao.com/', 'title': 'ChatGPT 怎么用最新详细教程-新手小白一看就会-ChatGPT 账号网'}\n",
      "page_content='ChatGPT会员4.0是什么？与3.5有什么区别？ChatGPT可以用来干什么？秘籍宝典使用手册ChatGPT 登录不了怎么办？ChatGPT Plus是什么ChatGPT:sorry you have been blocked\\xa0 \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nChatGPT账号、PLUS账号 购买请扫码添加下方微信咨询 \\n\\n\\n\\n \\n\\n\\n\\n买 ChatGPT 账号就到 ChatGPT 账号网' metadata={'description': '本文提供了适合小白简单易学的 ChatGPT 最新使用教程，一步步手把手，超详细亲测有效。通过本文你将学会如何注册 ChatGPT 账号，如何使用 ChatGPT，适合中国地区的用户。买ChatGPT账号就到ChatGPT账号网。', 'language': 'en-US', 'source': 'https://chatgptzhanghao.com/', 'title': 'ChatGPT 怎么用最新详细教程-新手小白一看就会-ChatGPT 账号网'}\n"
     ]
    }
   ],
   "source": [
    "from langchain.chat_models import ChatOpenAI\n",
    "from langchain.retrievers.multi_query import MultiQueryRetriever\n",
    "\n",
    "question = \"什么是GPT？\"\n",
    "llm = ChatOpenAI(temperature=0)\n",
    "#创建多维度提问查询\n",
    "retriever_from_llm = MultiQueryRetriever.from_llm(\n",
    "    retriever=vectordb.as_retriever(),llm=llm\n",
    ")\n",
    "# Set logging for the queries\n",
    "import logging\n",
    "logging.basicConfig()\n",
    "logging.getLogger(\"langchain.retrievers.multi_query\").setLevel(logging.INFO)\n",
    "\n",
    "#通过多维度查询输入，获取文档内容\n",
    "unique_docs = retriever_from_llm.get_relevant_documents(query=question)\n",
    "len(unique_docs)\n",
    "\n",
    "for doc in unique_docs:\n",
    "    print(doc)\n",
    "\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-14T14:47:13.562466Z",
     "start_time": "2024-07-14T14:47:10.742035Z"
    }
   },
   "id": "bfdc878c153b8cac",
   "execution_count": 3
  },
  {
   "cell_type": "markdown",
   "source": [
    "# 16.self filter"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "98cbdddfaea002a7"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking in indexes: https://mirrors.aliyun.com/pypi/simple/\r\n",
      "Requirement already satisfied: chromadb in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (0.5.0)\r\n",
      "Requirement already satisfied: build>=1.0.3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.2.1)\r\n",
      "Requirement already satisfied: requests>=2.28 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (2.31.0)\r\n",
      "Requirement already satisfied: pydantic>=1.9 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.10.17)\r\n",
      "Requirement already satisfied: chroma-hnswlib==0.7.3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (0.7.3)\r\n",
      "Requirement already satisfied: fastapi>=0.95.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (0.111.0)\r\n",
      "Requirement already satisfied: uvicorn>=0.18.3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.29.0)\r\n",
      "Requirement already satisfied: numpy>=1.22.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.26.4)\r\n",
      "Requirement already satisfied: posthog>=2.4.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (3.5.0)\r\n",
      "Requirement already satisfied: typing-extensions>=4.5.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (4.11.0)\r\n",
      "Requirement already satisfied: onnxruntime>=1.14.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.17.3)\r\n",
      "Requirement already satisfied: opentelemetry-api>=1.2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.24.0)\r\n",
      "Requirement already satisfied: opentelemetry-exporter-otlp-proto-grpc>=1.2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.24.0)\r\n",
      "Requirement already satisfied: opentelemetry-instrumentation-fastapi>=0.41b0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (0.45b0)\r\n",
      "Requirement already satisfied: opentelemetry-sdk>=1.2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.24.0)\r\n",
      "Requirement already satisfied: tokenizers>=0.13.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (0.19.1)\r\n",
      "Requirement already satisfied: pypika>=0.48.9 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (0.48.9)\r\n",
      "Requirement already satisfied: tqdm>=4.65.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (4.66.2)\r\n",
      "Requirement already satisfied: overrides>=7.3.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (7.7.0)\r\n",
      "Requirement already satisfied: importlib-resources in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (6.4.0)\r\n",
      "Requirement already satisfied: grpcio>=1.58.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (1.62.2)\r\n",
      "Requirement already satisfied: bcrypt>=4.0.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (4.1.3)\r\n",
      "Requirement already satisfied: typer>=0.9.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (0.12.3)\r\n",
      "Requirement already satisfied: kubernetes>=28.1.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (29.0.0)\r\n",
      "Requirement already satisfied: tenacity>=8.2.3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (8.3.0)\r\n",
      "Requirement already satisfied: PyYAML>=6.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (6.0.1)\r\n",
      "Requirement already satisfied: mmh3>=4.0.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (4.1.0)\r\n",
      "Requirement already satisfied: orjson>=3.9.12 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from chromadb) (3.10.3)\r\n",
      "Requirement already satisfied: packaging>=19.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from build>=1.0.3->chromadb) (23.2)\r\n",
      "Requirement already satisfied: pyproject_hooks in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from build>=1.0.3->chromadb) (1.1.0)\r\n",
      "Requirement already satisfied: starlette<0.38.0,>=0.37.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastapi>=0.95.2->chromadb) (0.37.2)\r\n",
      "Requirement already satisfied: fastapi-cli>=0.0.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastapi>=0.95.2->chromadb) (0.0.3)\r\n",
      "Requirement already satisfied: httpx>=0.23.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastapi>=0.95.2->chromadb) (0.27.0)\r\n",
      "Requirement already satisfied: jinja2>=2.11.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastapi>=0.95.2->chromadb) (3.1.3)\r\n",
      "Requirement already satisfied: python-multipart>=0.0.7 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastapi>=0.95.2->chromadb) (0.0.9)\r\n",
      "Requirement already satisfied: ujson!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0,>=4.0.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastapi>=0.95.2->chromadb) (5.10.0)\r\n",
      "Requirement already satisfied: email_validator>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from fastapi>=0.95.2->chromadb) (2.1.1)\r\n",
      "Requirement already satisfied: certifi>=14.05.14 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (2024.2.2)\r\n",
      "Requirement already satisfied: six>=1.9.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (1.16.0)\r\n",
      "Requirement already satisfied: python-dateutil>=2.5.3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (2.9.0.post0)\r\n",
      "Requirement already satisfied: google-auth>=1.0.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (2.29.0)\r\n",
      "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (1.8.0)\r\n",
      "Requirement already satisfied: requests-oauthlib in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (2.0.0)\r\n",
      "Requirement already satisfied: oauthlib>=3.2.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (3.2.2)\r\n",
      "Requirement already satisfied: urllib3>=1.24.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from kubernetes>=28.1.0->chromadb) (2.2.1)\r\n",
      "Requirement already satisfied: coloredlogs in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from onnxruntime>=1.14.1->chromadb) (15.0.1)\r\n",
      "Requirement already satisfied: flatbuffers in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from onnxruntime>=1.14.1->chromadb) (24.3.25)\r\n",
      "Requirement already satisfied: protobuf in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from onnxruntime>=1.14.1->chromadb) (4.25.3)\r\n",
      "Requirement already satisfied: sympy in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from onnxruntime>=1.14.1->chromadb) (1.12)\r\n",
      "Requirement already satisfied: deprecated>=1.2.6 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-api>=1.2.0->chromadb) (1.2.14)\r\n",
      "Requirement already satisfied: importlib-metadata<=7.0,>=6.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-api>=1.2.0->chromadb) (7.0.0)\r\n",
      "Requirement already satisfied: googleapis-common-protos~=1.52 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.63.0)\r\n",
      "Requirement already satisfied: opentelemetry-exporter-otlp-proto-common==1.24.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.24.0)\r\n",
      "Requirement already satisfied: opentelemetry-proto==1.24.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.24.0)\r\n",
      "Requirement already satisfied: opentelemetry-instrumentation-asgi==0.45b0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (0.45b0)\r\n",
      "Requirement already satisfied: opentelemetry-instrumentation==0.45b0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (0.45b0)\r\n",
      "Requirement already satisfied: opentelemetry-semantic-conventions==0.45b0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (0.45b0)\r\n",
      "Requirement already satisfied: opentelemetry-util-http==0.45b0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (0.45b0)\r\n",
      "Requirement already satisfied: setuptools>=16.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-instrumentation==0.45b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (69.5.1)\r\n",
      "Requirement already satisfied: wrapt<2.0.0,>=1.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-instrumentation==0.45b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (1.16.0)\r\n",
      "Requirement already satisfied: asgiref~=3.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from opentelemetry-instrumentation-asgi==0.45b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (3.8.1)\r\n",
      "Requirement already satisfied: monotonic>=1.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from posthog>=2.4.0->chromadb) (1.6)\r\n",
      "Requirement already satisfied: backoff>=1.10.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from posthog>=2.4.0->chromadb) (2.2.1)\r\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests>=2.28->chromadb) (3.3.2)\r\n",
      "Requirement already satisfied: idna<4,>=2.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from requests>=2.28->chromadb) (3.7)\r\n",
      "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from tokenizers>=0.13.2->chromadb) (0.22.2)\r\n",
      "Requirement already satisfied: click>=8.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from typer>=0.9.0->chromadb) (8.1.7)\r\n",
      "Requirement already satisfied: shellingham>=1.3.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from typer>=0.9.0->chromadb) (1.5.4)\r\n",
      "Requirement already satisfied: rich>=10.11.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from typer>=0.9.0->chromadb) (13.7.1)\r\n",
      "Requirement already satisfied: h11>=0.8 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from uvicorn>=0.18.3->uvicorn[standard]>=0.18.3->chromadb) (0.14.0)\r\n",
      "Requirement already satisfied: httptools>=0.5.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.6.1)\r\n",
      "Requirement already satisfied: python-dotenv>=0.13 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (1.0.1)\r\n",
      "Requirement already satisfied: uvloop!=0.15.0,!=0.15.1,>=0.14.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.19.0)\r\n",
      "Requirement already satisfied: watchfiles>=0.13 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.21.0)\r\n",
      "Requirement already satisfied: websockets>=10.4 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from uvicorn[standard]>=0.18.3->chromadb) (12.0)\r\n",
      "Requirement already satisfied: dnspython>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from email_validator>=2.0.0->fastapi>=0.95.2->chromadb) (2.6.1)\r\n",
      "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (5.3.3)\r\n",
      "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (0.4.0)\r\n",
      "Requirement already satisfied: rsa<5,>=3.1.4 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (4.9)\r\n",
      "Requirement already satisfied: anyio in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpx>=0.23.0->fastapi>=0.95.2->chromadb) (3.7.1)\r\n",
      "Requirement already satisfied: httpcore==1.* in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpx>=0.23.0->fastapi>=0.95.2->chromadb) (1.0.5)\r\n",
      "Requirement already satisfied: sniffio in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from httpx>=0.23.0->fastapi>=0.95.2->chromadb) (1.3.1)\r\n",
      "Requirement already satisfied: filelock in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers>=0.13.2->chromadb) (3.13.4)\r\n",
      "Requirement already satisfied: fsspec>=2023.5.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers>=0.13.2->chromadb) (2024.3.1)\r\n",
      "Requirement already satisfied: zipp>=0.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from importlib-metadata<=7.0,>=6.0->opentelemetry-api>=1.2.0->chromadb) (3.18.1)\r\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from jinja2>=2.11.2->fastapi>=0.95.2->chromadb) (2.1.5)\r\n",
      "Requirement already satisfied: markdown-it-py>=2.2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from rich>=10.11.0->typer>=0.9.0->chromadb) (3.0.0)\r\n",
      "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from rich>=10.11.0->typer>=0.9.0->chromadb) (2.17.2)\r\n",
      "Requirement already satisfied: humanfriendly>=9.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from coloredlogs->onnxruntime>=1.14.1->chromadb) (10.0)\r\n",
      "Requirement already satisfied: mpmath>=0.19 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from sympy->onnxruntime>=1.14.1->chromadb) (1.3.0)\r\n",
      "Requirement already satisfied: mdurl~=0.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer>=0.9.0->chromadb) (0.1.2)\r\n",
      "Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (0.6.0)\r\n",
      "\r\n",
      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m24.0\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m24.1.2\u001B[0m\r\n",
      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip3 install --upgrade pip\u001B[0m\r\n",
      "Note: you may need to restart the kernel to use updated packages.\n",
      "Looking in indexes: https://mirrors.aliyun.com/pypi/simple/\r\n",
      "Collecting lark\r\n",
      "  Downloading https://mirrors.aliyun.com/pypi/packages/e7/9c/eef7c591e6dc952f3636cfe0df712c0f9916cedf317810a3bb53ccb65cdd/lark-1.1.9-py3-none-any.whl (111 kB)\r\n",
      "\u001B[2K     \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m111.7/111.7 kB\u001B[0m \u001B[31m2.6 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0ma \u001B[36m0:00:01\u001B[0m\r\n",
      "\u001B[?25hInstalling collected packages: lark\r\n",
      "Successfully installed lark-1.1.9\r\n",
      "\r\n",
      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m24.0\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m24.1.2\u001B[0m\r\n",
      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip3 install --upgrade pip\u001B[0m\r\n",
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "%pip install chromadb\n",
    "%pip install lark"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-15T15:17:20.340918Z",
     "start_time": "2024-07-15T15:17:15.463446Z"
    }
   },
   "id": "eb8e8d7ddf9a7f60",
   "execution_count": 1
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/langchain_core/_api/deprecation.py:139: LangChainDeprecationWarning: The class `OpenAIEmbeddings` was deprecated in LangChain 0.0.9 and will be removed in 0.3.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import OpenAIEmbeddings`.\n",
      "  warn_deprecated(\n"
     ]
    }
   ],
   "source": [
    "def pretty_print_docs(docs):\n",
    "    print(f\"\\n{'-' * 100}\\n\".join([f\"Document {i+1}:\\n\\n\" + d.page_content for i, d in enumerate(docs)]))\n",
    "\n",
    "from langchain.schema import Document\n",
    "from langchain.embeddings import OpenAIEmbeddings\n",
    "from langchain.vectorstores import Chroma\n",
    "\n",
    "\n",
    "embeddings = OpenAIEmbeddings()\n",
    "docs = [\n",
    "    Document(\n",
    "        page_content=\"一群科学家带回了恐龙，然后混乱随之而来\",\n",
    "        metadata={\"year\": 1993, \"rating\": 7.7, \"genre\": \"science fiction\", \"director\": \"Unknown\"},\n",
    "    ),\n",
    "    Document(\n",
    "        page_content=\"莱昂纳多·迪卡普里奥迷失在梦中的梦中的梦中...\",\n",
    "        metadata={\"year\": 2010, \"director\": \"克里斯托弗·诺兰\", \"rating\": 8.2},\n",
    "    ),\n",
    "    Document(\n",
    "        page_content=\"一名心理学家迷失在一系列的梦中，梦中的梦，梦中的梦，然后《盗梦空间》使用了这个想法\",\n",
    "        metadata={\"year\": 2006, \"director\": \"今敏\", \"rating\": 8.6},\n",
    "    ),\n",
    "    Document(\n",
    "        page_content=\"一群女人非常健康，一些男人对她们充满了向往\",\n",
    "        metadata={\"year\": 2019, \"director\": \"格雷塔·葛韦格\", \"rating\": 8.3},\n",
    "    ),\n",
    "    Document(\n",
    "        page_content=\"玩具变得活灵活现，他们玩得很开心\",\n",
    "        metadata={\"year\": 1995, \"genre\": \"animated\", \"director\": \"Unknown\"},\n",
    "    ),\n",
    "    Document(\n",
    "        page_content=\"三个男人走进区域，三个男人从区域中走出\",\n",
    "        metadata={\n",
    "            \"year\": 1979,\n",
    "            \"rating\": 9.9,\n",
    "            \"director\": \"安德烈·塔可夫斯基\",\n",
    "            \"genre\": \"science fiction\",\n",
    "            \"rating\": 9.9,\n",
    "        },\n",
    "    ),\n",
    "]\n",
    "\n",
    "vectorstore = Chroma.from_documents(docs,embeddings)\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-15T15:24:02.810569Z",
     "start_time": "2024-07-15T15:23:58.891885Z"
    }
   },
   "id": "62ddcb82eba3fab0",
   "execution_count": 2
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Document 1:\n",
      "\n",
      "一群科学家带回了恐龙，然后混乱随之而来\n",
      "----------------------------------------------------------------------------------------------------\n",
      "Document 2:\n",
      "\n",
      "玩具变得活灵活现，他们玩得很开心\n",
      "----------------------------------------------------------------------------------------------------\n",
      "Document 3:\n",
      "\n",
      "一名心理学家迷失在一系列的梦中，梦中的梦，梦中的梦，然后《盗梦空间》使用了这个想法\n",
      "----------------------------------------------------------------------------------------------------\n",
      "Document 4:\n",
      "\n",
      "三个男人走进区域，三个男人从区域中走出\n",
      "Document 1:\n",
      "\n",
      "一名心理学家迷失在一系列的梦中，梦中的梦，梦中的梦，然后《盗梦空间》使用了这个想法\n",
      "----------------------------------------------------------------------------------------------------\n",
      "Document 2:\n",
      "\n",
      "三个男人走进区域，三个男人从区域中走出\n"
     ]
    }
   ],
   "source": [
    "from langchain.chat_models import ChatOpenAI\n",
    "from langchain.retrievers.self_query.base import SelfQueryRetriever\n",
    "from langchain.chains.query_constructor.base import AttributeInfo\n",
    "\n",
    "\n",
    "metadata_field_info = [\n",
    "    AttributeInfo(\n",
    "        name=\"genre\",\n",
    "        description=\"电影的类型\",\n",
    "        type=\"string or list[string]\",\n",
    "    ),\n",
    "    AttributeInfo(\n",
    "        name=\"year\",\n",
    "        description=\"电影发布的年份\",\n",
    "        type=\"integer\",\n",
    "    ),\n",
    "    AttributeInfo(\n",
    "        name=\"director\",\n",
    "        description=\"导演的名字\",\n",
    "        type=\"string\",\n",
    "    ),\n",
    "    AttributeInfo(\n",
    "        name=\"rating\", description=\"电影的评分（1-10）\", type=\"float\"\n",
    "    ),\n",
    "]\n",
    "document_content_description = \"电影摘要\"\n",
    "llm = ChatOpenAI(temperature=0)\n",
    "retriever = SelfQueryRetriever.from_llm(\n",
    "    llm, vectorstore, document_content_description, metadata_field_info, verbose=True\n",
    ")\n",
    "answer = retriever.get_relevant_documents(\"哪些电影提到了恐龙?\")\n",
    "pretty_print_docs(answer)\n",
    "\n",
    "answer1 = retriever.get_relevant_documents(\"我要看评分超过8.5 和梦有关的电影\")\n",
    "pretty_print_docs(answer1)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-15T15:28:30.026616Z",
     "start_time": "2024-07-15T15:28:24.001660Z"
    }
   },
   "id": "a336300e4114865",
   "execution_count": 5
  },
  {
   "cell_type": "markdown",
   "source": [
    "# 17. Time-weighted retriever"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "f36b5bc459287c3d"
  },
  {
   "cell_type": "code",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.\n"
     ]
    },
    {
     "data": {
      "text/plain": "[Document(page_content='hello world', metadata={'last_accessed_at': datetime.datetime(2024, 7, 15, 23, 49, 45, 245596), 'created_at': datetime.datetime(2024, 7, 15, 23, 49, 43, 726909), 'buffer_idx': 0})]"
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import faiss\n",
    "\n",
    "from datetime import datetime, timedelta\n",
    "from langchain.docstore import InMemoryDocstore\n",
    "from langchain.embeddings import OpenAIEmbeddings\n",
    "from langchain.retrievers import TimeWeightedVectorStoreRetriever\n",
    "from langchain.schema import Document\n",
    "from langchain.vectorstores import FAISS\n",
    "\n",
    "embedding_model = OpenAIEmbeddings()\n",
    "# Dimension of the FAISS index; it has to equal the length of the embedding vectors.\n",
    "# NOTE(review): 1536 presumably matches the default OpenAI embedding model - confirm if the model changes.\n",
    "embedding_size = 1536\n",
    "index = faiss.IndexFlatL2(embedding_size)\n",
    "# Pass the Embeddings object itself rather than its `embed_query` method: handing FAISS a bare\n",
    "# function emits the warning that `embedding_function` is expected to be an Embeddings object.\n",
    "vectorstore = FAISS(embedding_model, index, InMemoryDocstore({}), {})\n",
    "# decay_rate is deliberately almost zero; per the LangChain docs this makes recency have\n",
    "# practically no influence on the ranking. k=1 asks for a single document.\n",
    "retriever = TimeWeightedVectorStoreRetriever(\n",
    "    vectorstore=vectorstore, decay_rate=.0000000000000000000000001, k=1\n",
    ")\n",
    "\n",
    "# One document is stamped as last accessed a day ago, the other gets the retriever's default timestamps.\n",
    "yesterday = datetime.now() - timedelta(days=1)\n",
    "retriever.add_documents(\n",
    "    [Document(page_content=\"hello world\", metadata={\"last_accessed_at\": yesterday})]\n",
    ")\n",
    "retriever.add_documents([Document(page_content=\"hello food\")])\n",
    "\n",
    "# Last expression of the cell, so the retrieved documents are shown as the cell output.\n",
    "retriever.get_relevant_documents(\"hello world\")"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-15T15:49:45.256242Z",
     "start_time": "2024-07-15T15:49:43.726005Z"
    }
   },
   "id": "8984a21e4b67c0e0",
   "execution_count": 9
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
